1 Notes

This report was generated on 2021-07-27 15:00:49. R version: 4.1.0 on x86_64-pc-linux-gnu. For this report, CRAN packages as of 2021-01-01 were used.

…

1.1 R-Script & data

The preprocessing and analysis of the data were conducted in the R project for statistical computing. The RMarkdown script used to generate this document and all the resulting data can be downloaded via this link. By executing main.Rmd, the process described here can be reproduced and this document can be generated. In the course of this, data from the folder input will be processed and results will be written to output. The HTML online version of the analysis can be accessed through this link.

1.2 GitHub

The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/rddj-template.

1.3 License

…

1.4 Data description of output files

1.4.0.1 abc.csv (Example)

Attribute Type Description
a Numeric …
b Numeric …
c Numeric …

1.4.0.2 xyz.csv

…

2 Set up

## [1] "package package:rstudioapi detached"
## [1] "package package:knitr detached"

2.1 Define packages

# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# if you don't need a package, remove it from here (commenting not sufficient)
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
# Write the package manifest to a file so that checkpoint can scan it for
# dependencies; it is sourced in section 2.3 and deleted afterwards.
cat("
library(rstudioapi)
library(tidyverse, warn.conflicts = FALSE) # ggplot2, dplyr, tidyr, readr, purrr, tibble, magrittr, readxl
library(scales) # scales for ggplot2
library(jsonlite) # json
library(lintr) # code linting
library(sf) # spatial data handling
library(rmarkdown)
library(data.table)
library(cowplot) # theme
library(extrafont)
library(waldo) # compare
library(psych) # some useful funs 
library(ggrepel) # text labels
library(skimr) # data quality
library(lsr) # book package
library(vcd)
library(sciplot)
library(gplots)
library(car) # statistics tests 
library(lmtest) # more statistics tests
library(itns) # book itns 
library(janitor)", # names
  file = "manifest.R"
)

2.2 Install packages

# if checkpoint is not yet installed, install it (for people using this
# system for the first time)
if (!require(checkpoint)) {
  if (!require(devtools)) {
    install.packages("devtools", repos = "http://cran.us.r-project.org")
    require(devtools)
  }
  # pin the checkpoint version so the bootstrap itself is reproducible
  devtools::install_github("RevolutionAnalytics/checkpoint",
    ref = "v0.3.2", # could be adapted later (as of now -- beginning of
    # July 2017 -- this is the current release on CRAN)
    repos = "http://cran.us.r-project.org"
  )
  require(checkpoint)
}
# nolint start
# checkpoint needs a writable snapshot directory; create it on first run
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# (TRUE/FALSE spelled out: T and F are ordinary, reassignable variables
# in R, so the full constants are safer)
checkpoint(
  snapshot_date = package_date,
  project = path_to_wd,
  verbose = TRUE,
  scanForPackages = TRUE,
  use.knitr = FALSE,
  R.version = r_version
)
rm(package_date)

2.3 Load packages

# load the packages listed in the manifest written in 2.1, then delete the
# temporary file; sessionInfo() records the exact package versions used
source("manifest.R")
unlink("manifest.R")
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] janitor_2.0.1     itns_0.1.0        lmtest_0.9-38     zoo_1.8-8        
##  [5] car_3.0-10        carData_3.0-4     gplots_3.1.1      sciplot_1.2-0    
##  [9] vcd_1.4-8         lsr_0.5           skimr_2.1.2       ggrepel_0.9.0    
## [13] psych_2.0.12      waldo_0.2.5       extrafont_0.17    cowplot_1.1.1    
## [17] data.table_1.13.6 rmarkdown_2.6     sf_0.9-6          lintr_2.0.1      
## [21] jsonlite_1.7.2    scales_1.1.1      forcats_0.5.0     stringr_1.4.0    
## [25] dplyr_1.0.2       purrr_0.3.4       readr_1.4.0       tidyr_1.1.2      
## [29] tibble_3.0.4      ggplot2_3.3.3     tidyverse_1.3.0   rstudioapi_0.13  
## [33] checkpoint_1.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0   ellipsis_0.3.1     class_7.3-19       rio_0.5.16        
##  [5] rprojroot_2.0.2    snakecase_0.11.0   base64enc_0.1-3    fs_1.5.0          
##  [9] remotes_2.4.0      lubridate_1.7.9.2  xml2_1.3.2         mnormt_2.0.2      
## [13] knitr_1.30         broom_0.7.3        Rttf2pt1_1.3.8     dbplyr_2.0.0      
## [17] compiler_4.1.0     httr_1.4.2         backports_1.2.1    assertthat_0.2.1  
## [21] lazyeval_0.2.2     cli_2.5.0          htmltools_0.5.0    tools_4.1.0       
## [25] gtable_0.3.0       glue_1.4.2         Rcpp_1.0.5         cellranger_1.1.0  
## [29] vctrs_0.3.6        nlme_3.1-152       extrafontdb_1.0    xfun_0.19         
## [33] ps_1.5.0           openxlsx_4.2.3     rvest_0.3.6        lifecycle_1.0.0   
## [37] gtools_3.8.2       MASS_7.3-54        hms_0.5.3          rex_1.2.0         
## [41] parallel_4.1.0     yaml_2.2.1         curl_4.3           stringi_1.5.3     
## [45] desc_1.3.0         e1071_1.7-4        caTools_1.18.0     cyclocomp_1.1.0   
## [49] zip_2.2.0          repr_1.1.0         rlang_0.4.10       pkgconfig_2.0.3   
## [53] bitops_1.0-6       evaluate_0.14      lattice_0.20-44    processx_3.5.2    
## [57] tidyselect_1.1.0   magrittr_2.0.1     R6_2.5.0           generics_0.1.0    
## [61] DBI_1.1.0          pillar_1.4.7       haven_2.3.1        foreign_0.8-81    
## [65] withr_2.4.2        units_0.6-7        abind_1.4-5        modelr_0.1.8      
## [69] crayon_1.3.4       KernSmooth_2.23-20 tmvnsim_1.0-2      readxl_1.3.1      
## [73] callr_3.7.0        reprex_0.3.0       digest_0.6.27      classInt_0.4-3    
## [77] munsell_0.5.0

2.4 Load additional scripts

# if you want to outsource logic to other script files, see README for
# further information
# Load all visualization helper scripts: register each file's chunks with
# knitr, then source it into the session (same order as before)
for (script in c(
  "scripts/dviz.supp.R",
  "scripts/themes.R",
  "scripts/plot_grid.R",
  "scripts/align_legend.R",
  "scripts/label_log10.R"
)) {
  knitr::read_chunk(script)
  source(script)
}

3 Programming

3.1 Loops

# the while loop (written here with repeat/break: identical behaviour --
# keep adding 17 until the total reaches 1000)
x <- 0
repeat {
  if (x >= 1000) break # condition checked at the top, as while() does
  x <- x + 17
}
x
## [1] 1003
# the for loop: the body runs once per element of the sequence
for (i in seq_len(3)) { # seq_len(3) is the safe spelling of 1:3
  print("hello")
}
## [1] "hello"
## [1] "hello"
## [1] "hello"
# a simple example: upper-case each word and report its length
words <- c("it", "was", "the", "dirty", "end", "of", "winter")
for (w in words) {
  msg <- sprintf("%s has %d letters", toupper(w), nchar(w))
  print(msg)
}
## [1] "IT has 2 letters"
## [1] "WAS has 3 letters"
## [1] "THE has 3 letters"
## [1] "DIRTY has 5 letters"
## [1] "END has 3 letters"
## [1] "OF has 2 letters"
## [1] "WINTER has 6 letters"
# a more complex example
# set up
month <- 0 # count the number of months
balance <- 300000 # initial mortgage balance
payments <- 1600 # monthly payments
interest <- 0.05 # 5% interest rate per year
total.paid <- 0 # track what you've paid the bank

# convert annual interest to a monthly multiplier
# (the 12th root of 1.05: compounding this monthly gives 5% over a year)
monthly.multiplier <- (1 + interest)^(1 / 12)

# keep looping until the loan is paid off...
# (the final month still pays the full amount, so the balance ends below
# zero -- see the last line of the output)
while (balance > 0) {

  # do the calculations for this month
  month <- month + 1 # one more month
  balance <- balance * monthly.multiplier # add the interest
  balance <- balance - payments # make the payments
  total.paid <- total.paid + payments # track the total paid

  # print the results on screen
  cat("month", month, ": balance", round(balance), "\n")
} # end of loop
## month 1 : balance 299622 
## month 2 : balance 299243 
## month 3 : balance 298862 
## month 4 : balance 298480 
## month 5 : balance 298096 
## month 6 : balance 297710 
## month 7 : balance 297323 
## month 8 : balance 296934 
## month 9 : balance 296544 
## month 10 : balance 296152 
## month 11 : balance 295759 
## month 12 : balance 295364 
## month 13 : balance 294967 
## month 14 : balance 294569 
## month 15 : balance 294169 
## month 16 : balance 293768 
## month 17 : balance 293364 
## month 18 : balance 292960 
## month 19 : balance 292553 
## month 20 : balance 292145 
## month 21 : balance 291735 
## month 22 : balance 291324 
## month 23 : balance 290911 
## month 24 : balance 290496 
## month 25 : balance 290079 
## month 26 : balance 289661 
## month 27 : balance 289241 
## month 28 : balance 288820 
## month 29 : balance 288396 
## month 30 : balance 287971 
## month 31 : balance 287545 
## month 32 : balance 287116 
## month 33 : balance 286686 
## month 34 : balance 286254 
## month 35 : balance 285820 
## month 36 : balance 285385 
## month 37 : balance 284947 
## month 38 : balance 284508 
## month 39 : balance 284067 
## month 40 : balance 283625 
## month 41 : balance 283180 
## month 42 : balance 282734 
## month 43 : balance 282286 
## month 44 : balance 281836 
## month 45 : balance 281384 
## month 46 : balance 280930 
## month 47 : balance 280475 
## month 48 : balance 280018 
## month 49 : balance 279559 
## month 50 : balance 279098 
## month 51 : balance 278635 
## month 52 : balance 278170 
## month 53 : balance 277703 
## month 54 : balance 277234 
## month 55 : balance 276764 
## month 56 : balance 276292 
## month 57 : balance 275817 
## month 58 : balance 275341 
## month 59 : balance 274863 
## month 60 : balance 274382 
## month 61 : balance 273900 
## month 62 : balance 273416 
## month 63 : balance 272930 
## month 64 : balance 272442 
## month 65 : balance 271952 
## month 66 : balance 271460 
## month 67 : balance 270966 
## month 68 : balance 270470 
## month 69 : balance 269972 
## month 70 : balance 269472 
## month 71 : balance 268970 
## month 72 : balance 268465 
## month 73 : balance 267959 
## month 74 : balance 267451 
## month 75 : balance 266941 
## month 76 : balance 266428 
## month 77 : balance 265914 
## month 78 : balance 265397 
## month 79 : balance 264878 
## month 80 : balance 264357 
## month 81 : balance 263834 
## month 82 : balance 263309 
## month 83 : balance 262782 
## month 84 : balance 262253 
## month 85 : balance 261721 
## month 86 : balance 261187 
## month 87 : balance 260651 
## month 88 : balance 260113 
## month 89 : balance 259573 
## month 90 : balance 259031 
## month 91 : balance 258486 
## month 92 : balance 257939 
## month 93 : balance 257390 
## month 94 : balance 256839 
## month 95 : balance 256285 
## month 96 : balance 255729 
## month 97 : balance 255171 
## month 98 : balance 254611 
## month 99 : balance 254048 
## month 100 : balance 253483 
## month 101 : balance 252916 
## month 102 : balance 252346 
## month 103 : balance 251774 
## month 104 : balance 251200 
## month 105 : balance 250623 
## month 106 : balance 250044 
## month 107 : balance 249463 
## month 108 : balance 248879 
## month 109 : balance 248293 
## month 110 : balance 247705 
## month 111 : balance 247114 
## month 112 : balance 246521 
## month 113 : balance 245925 
## month 114 : balance 245327 
## month 115 : balance 244727 
## month 116 : balance 244124 
## month 117 : balance 243518 
## month 118 : balance 242911 
## month 119 : balance 242300 
## month 120 : balance 241687 
## month 121 : balance 241072 
## month 122 : balance 240454 
## month 123 : balance 239834 
## month 124 : balance 239211 
## month 125 : balance 238585 
## month 126 : balance 237958 
## month 127 : balance 237327 
## month 128 : balance 236694 
## month 129 : balance 236058 
## month 130 : balance 235420 
## month 131 : balance 234779 
## month 132 : balance 234136 
## month 133 : balance 233489 
## month 134 : balance 232841 
## month 135 : balance 232189 
## month 136 : balance 231535 
## month 137 : balance 230879 
## month 138 : balance 230219 
## month 139 : balance 229557 
## month 140 : balance 228892 
## month 141 : balance 228225 
## month 142 : balance 227555 
## month 143 : balance 226882 
## month 144 : balance 226206 
## month 145 : balance 225528 
## month 146 : balance 224847 
## month 147 : balance 224163 
## month 148 : balance 223476 
## month 149 : balance 222786 
## month 150 : balance 222094 
## month 151 : balance 221399 
## month 152 : balance 220701 
## month 153 : balance 220000 
## month 154 : balance 219296 
## month 155 : balance 218590 
## month 156 : balance 217880 
## month 157 : balance 217168 
## month 158 : balance 216453 
## month 159 : balance 215735 
## month 160 : balance 215014 
## month 161 : balance 214290 
## month 162 : balance 213563 
## month 163 : balance 212833 
## month 164 : balance 212100 
## month 165 : balance 211364 
## month 166 : balance 210625 
## month 167 : balance 209883 
## month 168 : balance 209138 
## month 169 : balance 208390 
## month 170 : balance 207639 
## month 171 : balance 206885 
## month 172 : balance 206128 
## month 173 : balance 205368 
## month 174 : balance 204605 
## month 175 : balance 203838 
## month 176 : balance 203069 
## month 177 : balance 202296 
## month 178 : balance 201520 
## month 179 : balance 200741 
## month 180 : balance 199959 
## month 181 : balance 199174 
## month 182 : balance 198385 
## month 183 : balance 197593 
## month 184 : balance 196798 
## month 185 : balance 196000 
## month 186 : balance 195199 
## month 187 : balance 194394 
## month 188 : balance 193586 
## month 189 : balance 192775 
## month 190 : balance 191960 
## month 191 : balance 191142 
## month 192 : balance 190321 
## month 193 : balance 189496 
## month 194 : balance 188668 
## month 195 : balance 187837 
## month 196 : balance 187002 
## month 197 : balance 186164 
## month 198 : balance 185323 
## month 199 : balance 184478 
## month 200 : balance 183629 
## month 201 : balance 182777 
## month 202 : balance 181922 
## month 203 : balance 181063 
## month 204 : balance 180201 
## month 205 : balance 179335 
## month 206 : balance 178466 
## month 207 : balance 177593 
## month 208 : balance 176716 
## month 209 : balance 175836 
## month 210 : balance 174953 
## month 211 : balance 174065 
## month 212 : balance 173175 
## month 213 : balance 172280 
## month 214 : balance 171382 
## month 215 : balance 170480 
## month 216 : balance 169575 
## month 217 : balance 168666 
## month 218 : balance 167753 
## month 219 : balance 166836 
## month 220 : balance 165916 
## month 221 : balance 164992 
## month 222 : balance 164064 
## month 223 : balance 163133 
## month 224 : balance 162197 
## month 225 : balance 161258 
## month 226 : balance 160315 
## month 227 : balance 159368 
## month 228 : balance 158417 
## month 229 : balance 157463 
## month 230 : balance 156504 
## month 231 : balance 155542 
## month 232 : balance 154576 
## month 233 : balance 153605 
## month 234 : balance 152631 
## month 235 : balance 151653 
## month 236 : balance 150671 
## month 237 : balance 149685 
## month 238 : balance 148695 
## month 239 : balance 147700 
## month 240 : balance 146702 
## month 241 : balance 145700 
## month 242 : balance 144693 
## month 243 : balance 143683 
## month 244 : balance 142668 
## month 245 : balance 141650 
## month 246 : balance 140627 
## month 247 : balance 139600 
## month 248 : balance 138568 
## month 249 : balance 137533 
## month 250 : balance 136493 
## month 251 : balance 135449 
## month 252 : balance 134401 
## month 253 : balance 133349 
## month 254 : balance 132292 
## month 255 : balance 131231 
## month 256 : balance 130166 
## month 257 : balance 129096 
## month 258 : balance 128022 
## month 259 : balance 126943 
## month 260 : balance 125861 
## month 261 : balance 124773 
## month 262 : balance 123682 
## month 263 : balance 122586 
## month 264 : balance 121485 
## month 265 : balance 120380 
## month 266 : balance 119270 
## month 267 : balance 118156 
## month 268 : balance 117038 
## month 269 : balance 115915 
## month 270 : balance 114787 
## month 271 : balance 113654 
## month 272 : balance 112518 
## month 273 : balance 111376 
## month 274 : balance 110230 
## month 275 : balance 109079 
## month 276 : balance 107923 
## month 277 : balance 106763 
## month 278 : balance 105598 
## month 279 : balance 104428 
## month 280 : balance 103254 
## month 281 : balance 102074 
## month 282 : balance 100890 
## month 283 : balance 99701 
## month 284 : balance 98507 
## month 285 : balance 97309 
## month 286 : balance 96105 
## month 287 : balance 94897 
## month 288 : balance 93683 
## month 289 : balance 92465 
## month 290 : balance 91242 
## month 291 : balance 90013 
## month 292 : balance 88780 
## month 293 : balance 87542 
## month 294 : balance 86298 
## month 295 : balance 85050 
## month 296 : balance 83797 
## month 297 : balance 82538 
## month 298 : balance 81274 
## month 299 : balance 80005 
## month 300 : balance 78731 
## month 301 : balance 77452 
## month 302 : balance 76168 
## month 303 : balance 74878 
## month 304 : balance 73583 
## month 305 : balance 72283 
## month 306 : balance 70977 
## month 307 : balance 69666 
## month 308 : balance 68350 
## month 309 : balance 67029 
## month 310 : balance 65702 
## month 311 : balance 64369 
## month 312 : balance 63032 
## month 313 : balance 61688 
## month 314 : balance 60340 
## month 315 : balance 58986 
## month 316 : balance 57626 
## month 317 : balance 56261 
## month 318 : balance 54890 
## month 319 : balance 53514 
## month 320 : balance 52132 
## month 321 : balance 50744 
## month 322 : balance 49351 
## month 323 : balance 47952 
## month 324 : balance 46547 
## month 325 : balance 45137 
## month 326 : balance 43721 
## month 327 : balance 42299 
## month 328 : balance 40871 
## month 329 : balance 39438 
## month 330 : balance 37998 
## month 331 : balance 36553 
## month 332 : balance 35102 
## month 333 : balance 33645 
## month 334 : balance 32182 
## month 335 : balance 30713 
## month 336 : balance 29238 
## month 337 : balance 27758 
## month 338 : balance 26271 
## month 339 : balance 24778 
## month 340 : balance 23279 
## month 341 : balance 21773 
## month 342 : balance 20262 
## month 343 : balance 18745 
## month 344 : balance 17221 
## month 345 : balance 15691 
## month 346 : balance 14155 
## month 347 : balance 12613 
## month 348 : balance 11064 
## month 349 : balance 9509 
## month 350 : balance 7948 
## month 351 : balance 6380 
## month 352 : balance 4806 
## month 353 : balance 3226 
## month 354 : balance 1639 
## month 355 : balance 46 
## month 356 : balance -1554
# print the total payments at the end
# (the last month pays the full 1600 although less is owed -- the final
# balance above is negative -- so this slightly overstates the true cost)
cat("total payments made", total.paid, "\n")
## total payments made 569600

3.2 Conditional statements

# find out what day it is...
today <- Sys.Date() # pull the date from the system clock
day <- weekdays(today) # what day of the week it is

# now make a choice depending on the day...
# (if/else expects a single TRUE/FALSE value, which suits this scalar test)
if (day == "Monday") {
  print("I don't like Mondays")
} else {
  print("I'm a happy little automaton")
}
## [1] "I'm a happy little automaton"

3.3 Functions

A formal argument can be a symbol (i.e. a variable name such as x or y), a statement of the form symbol = expression (e.g. pch = 16), or the special formal argument ... (dot-dot-dot).

# a simple function
quadruple <- function(x) {
  # four times the input; the last evaluated expression is returned
  x * 4
}
quadruple(3)
## [1] 12
# another simple function with two arguments
pow <- function(x, y = 1) { # y has a default, so pow(x) means pow(x, 1)
  x^y # raise x to the power y; returned implicitly
}
pow(3, 2)
## [1] 9
# using the ... argument
doubleMax <- function(...) {
  # everything passed in is forwarded to max(); the winner is doubled
  2 * max(...)
}
doubleMax(4, 6, 9)
## [1] 18

3.4 Implicit loops

# sapply(): apply FUN to every element and simplify the result
# (here: a named integer vector of word lengths)
words <- c("along", "the", "loom", "of", "the", "land")
sapply(words, FUN = nchar)
## along   the  loom    of   the  land 
##     5     3     4     2     3     4
# tapply(): apply FUN within groups defined by the INDEX variable
gender <- c("male", "male", "female", "female", "male")
age <- c(10, 12, 9, 11, 13)
# loop over all values that appear in the INDEX grouping variable
tapply(age, INDEX = gender, FUN = mean)
##   female     male 
## 10.00000 11.66667
# by
by(age, INDICES = gender, FUN = mean) # does the same as tapply
## gender: female
## [1] 10
## ------------------------------------------------------------ 
## gender: male
## [1] 11.66667

4 Descriptive statistics

4.1 Measures of central tendency

# Load data
# (aflsmall.Rdata provides the afl.finalists factor and the afl.margins
# vector used throughout section 4)
load("~/lsr/lsr/analysis/input/aflsmall.Rdata")
table(afl.finalists)
## afl.finalists
##         Adelaide         Brisbane          Carlton      Collingwood 
##               26               25               26               28 
##         Essendon          Fitzroy        Fremantle          Geelong 
##               32                0                6               39 
##         Hawthorn        Melbourne  North Melbourne    Port Adelaide 
##               27               28               28               17 
##         Richmond         St Kilda           Sydney       West Coast 
##                6               24               26               38 
## Western Bulldogs 
##               24
# calculate the mode (modeOf() comes from the lsr package)
m <- modeOf(afl.finalists)
cat("The mode of the dataset is", m, "\n")
## The mode of the dataset is Geelong
# calculate the modal frequency (how often the modal value occurs)
# NOTE(review): `max` shadows base::max() from here on; a different name
# would be safer
max <- maxFreq(afl.finalists)
cat("The modal frequency of the dataset is", max, "\n")
## The modal frequency of the dataset is 39

4.2 Measures of variability

# calculate quantiles (probs = .5 is the median)
quantile(afl.margins, probs = .5)
##  50% 
## 30.5
quantile(afl.margins, probs = c(.25, .75))
##   25%   75% 
## 12.75 50.50
# calculate IQR (distance between the 25th and 75th percentiles)
i <- IQR(afl.margins)
cat("IQR is", i, "\n")
## IQR is 37.75
# calculate the mean absolute deviation (AAD)
# (aad() comes from the lsr package; only the first 5 margins are used here)
a <- aad(afl.margins[1:5])
cat("The mean absolute deviation (AAD) is", a, "\n")
## The mean absolute deviation (AAD) is 15.52
# calculate variance (also called mean square deviation)
v <- var(afl.margins[1:5])
cat("The variance is", v, "\n")
## The variance is 405.8
# calculate standard deviation (also called root mean squared deviation)
s <- sd(afl.margins)
cat("The standard deviation is", s, "\n")
## The standard deviation is 26.07364
# calculate median absolute deviation (MAD)
# (constant = 1 gives the raw MAD; the default 1.4826 would rescale it to
# be consistent with the standard deviation for normal data)
m <- mad(afl.margins, constant = 1)
cat("The median absolute deviation (MAD) is", m, "\n")
## The median absolute deviation (MAD) is 19.5
# calculate skewness (skew() comes from the psych package)
sk <- skew(afl.margins)
cat("The skewness is", sk, "\n")
## The skewness is 0.7671555
# calculate kurtosis (kurtosi() comes from the psych package)
k <- kurtosi(afl.margins)
cat("The kurtosis is", k, "\n")
## The kurtosis is 0.02962633

4.3 Overall data summary

# load data
# (clinicaltrial.Rdata provides the clin.trial data frame: drug, therapy,
# mood.gain -- see the per-group summaries below)
load("~/lsr/lsr/analysis/input/clinicaltrial.Rdata")

# for numeric variables
summary(afl.margins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.75   30.50   35.30   50.50  116.00
# for nominal (factor) variables
summary(afl.finalists)
##         Adelaide         Brisbane          Carlton      Collingwood 
##               26               25               26               28 
##         Essendon          Fitzroy        Fremantle          Geelong 
##               32                0                6               39 
##         Hawthorn        Melbourne  North Melbourne    Port Adelaide 
##               27               28               28               17 
##         Richmond         St Kilda           Sydney       West Coast 
##                6               24               26               38 
## Western Bulldogs 
##               24
# for data frames (describe() comes from psych; starred variables are
# factors summarised via their numeric codes -- see ?psych::describe)
describe(x = clin.trial)
vars n mean sd median trimmed mad min max range skew kurtosis se
drug* 1 18 2.0000000 0.8401681 2.00 2.000 1.48260 1.0 3.0 2.0 0.0000000 -1.662037 0.1980295
therapy* 2 18 1.5000000 0.5144958 1.50 1.500 0.74130 1.0 2.0 1.0 0.0000000 -2.108025 0.1212678
mood.gain 3 18 0.8833333 0.5338539 0.85 0.875 0.66717 0.1 1.8 1.7 0.1333981 -1.439739 0.1258306
# broken down by grouping variable
describeBy(clin.trial, group = clin.trial$therapy)
## 
##  Descriptive statistics by group 
## group: no.therapy
##           vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
## therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
## mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
## ------------------------------------------------------------ 
## group: CBT
##           vars n mean   sd median trimmed  mad min max range  skew kurtosis
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
## therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
## mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
##             se
## drug*     0.29
## therapy*  0.00
## mood.gain 0.15
# by() with FUN = describe reproduces the describeBy() breakdown
by(clin.trial, INDICES = clin.trial$therapy, FUN = describe)
## clin.trial$therapy: no.therapy
##           vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
## therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
## mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
## ------------------------------------------------------------ 
## clin.trial$therapy: CBT
##           vars n mean   sd median trimmed  mad min max range  skew kurtosis
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
## therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
## mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
##             se
## drug*     0.29
## therapy*  0.00
## mood.gain 0.15
# per-group summary(): factor counts and five-number summaries
by(clin.trial, INDICES = clin.trial$therapy, FUN = summary)
## clin.trial$therapy: no.therapy
##        drug         therapy    mood.gain     
##  placebo :3   no.therapy:9   Min.   :0.1000  
##  anxifree:3   CBT       :0   1st Qu.:0.3000  
##  joyzepam:3                  Median :0.5000  
##                              Mean   :0.7222  
##                              3rd Qu.:1.3000  
##                              Max.   :1.7000  
## ------------------------------------------------------------ 
## clin.trial$therapy: CBT
##        drug         therapy    mood.gain    
##  placebo :3   no.therapy:0   Min.   :0.300  
##  anxifree:3   CBT       :9   1st Qu.:0.800  
##  joyzepam:3                  Median :1.100  
##                              Mean   :1.044  
##                              3rd Qu.:1.300  
##                              Max.   :1.800
# mean mood.gain for each drug x therapy cell, via the formula interface
aggregate(mood.gain ~ drug + therapy, data = clin.trial, FUN = mean)
drug therapy mood.gain
placebo no.therapy 0.300000
anxifree no.therapy 0.400000
joyzepam no.therapy 1.466667
placebo CBT 0.600000
anxifree CBT 1.033333
joyzepam CBT 1.500000

4.4 Correlations

# load data
# (parenthood.Rdata provides the parenthood data frame: dan.sleep,
# baby.sleep, dan.grump, day -- see the correlation matrix below)
load("~/lsr/lsr/analysis/input/parenthood.Rdata")

# quick look at the marginal distributions
hist(parenthood$dan.grump)

hist(parenthood$dan.sleep)

hist(parenthood$baby.sleep)

# scatterplots of grumpiness against the two sleep variables
plot(x = parenthood$dan.sleep, y = parenthood$dan.grump)

plot(x = parenthood$baby.sleep, y = parenthood$dan.grump)

# calculate correlations (Pearson's correlation is cor()'s default method)
cor(x = parenthood$dan.sleep, y = parenthood$dan.grump)
## [1] -0.903384
# given a whole data frame, cor() returns the full correlation matrix
cor(parenthood)
##              dan.sleep  baby.sleep   dan.grump         day
## dan.sleep   1.00000000  0.62794934 -0.90338404 -0.09840768
## baby.sleep  0.62794934  1.00000000 -0.56596373 -0.01043394
## dan.grump  -0.90338404 -0.56596373  1.00000000  0.07647926
## day        -0.09840768 -0.01043394  0.07647926  1.00000000
# load data
load("~/lsr/lsr/analysis/input/effort.Rdata")

# calculate correlations (Spearman's correlation = Pearson on the ranks)
hours_rank <- rank(effort$hours)
grade_rank <- rank(effort$grade)
cor(hours_rank, grade_rank)
## [1] 1
# or, equivalently, let cor() do the ranking
cor(effort$hours, effort$grade, method = "spearman")
## [1] 1
# calculate correlations getting rid of all non-numeric variables
# load data
load("~/lsr/lsr/analysis/input/work.Rdata")

# lsr::correlate() skips non-numeric columns instead of raising an error
lsr::correlate(work)
## 
## CORRELATIONS
## ============
## - correlation type:  pearson 
## - correlations shown only when both variables are numeric
## 
##           hours  tasks   pay    day weekday   week day.type
## hours         .  0.800 0.760 -0.049       .  0.018        .
## tasks     0.800      . 0.720 -0.072       . -0.013        .
## pay       0.760  0.720     .  0.137       .  0.196        .
## day      -0.049 -0.072 0.137      .       .  0.990        .
## weekday       .      .     .      .       .      .        .
## week      0.018 -0.013 0.196  0.990       .      .        .
## day.type      .      .     .      .       .      .        .
# the same correlation table using Spearman's rank correlation
lsr::correlate(work, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##           hours  tasks   pay    day weekday   week day.type
## hours         .  0.805 0.745 -0.047       .  0.010        .
## tasks     0.805      . 0.730 -0.068       . -0.008        .
## pay       0.745  0.730     .  0.094       .  0.154        .
## day      -0.047 -0.068 0.094      .       .  0.990        .
## weekday       .      .     .      .       .      .        .
## week      0.010 -0.008 0.154  0.990       .      .        .
## day.type      .      .     .      .       .      .        .
# look at the data
# NOTE(review): body_well is not loaded from a file here; it presumably
# comes from the itns package loaded in 2.1 -- confirm
head(body_well)
sex bodysat wellbeing
female 4.000000 1.0
female 2.428571 2.4
female 3.000000 2.3
female 3.000000 2.4
female 3.142857 2.8
female 2.428571 3.2
# structure: 106 rows, factor sex plus two numeric scores
dplyr::glimpse(body_well)
## Rows: 106
## Columns: 3
## $ sex       <fct> female, female, female, female, female, female, female, fem…
## $ bodysat   <dbl> 4.000000, 2.428571, 3.000000, 3.000000, 3.142857, 2.428571,…
## $ wellbeing <dbl> 1.0, 2.4, 2.3, 2.4, 2.8, 3.2, 3.4, 3.4, 3.6, 3.6, 3.8, 3.8,…
# base scatterplot...
plot(x = body_well$bodysat, y = body_well$wellbeing)

# ...and the ggplot2 version with a linear trend line
ggplot(body_well, aes(x = bodysat, y = wellbeing)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_cowplot()

# calculating Spearman's rank correlation coefficient
# (method = "spearman" is requested here, not the Pearson default)
cor(body_well$bodysat, body_well$wellbeing, method = "spearman")
## [1] 0.4726243
lsr::correlate(x = body_well$bodysat, y = body_well$wellbeing, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##       y.var
## x.var 0.473
# eyeballing r correlation coefficient: dashed lines mark the means of x
# and y, splitting the plot into quadrants
ggplot(body_well, aes(x = bodysat, y = wellbeing)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_vline(aes(xintercept = mean(bodysat)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(wellbeing)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# look at the data
# NOTE(review): thomason1 is not loaded from a file here; presumably a
# dataset from the itns package loaded in 2.1 -- confirm
head(thomason1)
pre post
13 14
12 13
12 16
9 12
14 15
17 18
# structure: 12 rows, two numeric columns (pre and post)
dplyr::glimpse(thomason1)
## Rows: 12
## Columns: 2
## $ pre  <dbl> 13, 12, 12, 9, 14, 17, 14, 9, 6, 7, 11, 15
## $ post <dbl> 14, 13, 16, 12, 15, 18, 13, 10, 10, 8, 14, 16
# base scatterplot of post against pre
plot(x = thomason1$pre, y = thomason1$post)

# ggplot2 version with fixed axis limits and a linear trend line
ggplot(thomason1, aes(x = pre, y = post)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 20), breaks = seq(0, 20, 2)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 18), breaks = seq(0, 18, 2)) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_cowplot()

# calculating Spearman's rank correlation coefficient
# (method = "spearman" is requested here, not the Pearson default)
cor(thomason1$pre, thomason1$post, method = "spearman")
## [1] 0.8424792
lsr::correlate(x = thomason1$pre, y = thomason1$post, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##       y.var
## x.var 0.842
# eyeballing r correlation coefficient: mean lines divide the plot into
# quadrants; points clustered in two opposite quadrants suggest strong r
ggplot(thomason1, aes(x = pre, y = post)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 20), breaks = seq(0, 20, 2)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 18), breaks = seq(0, 18, 2)) +
  geom_vline(aes(xintercept = mean(pre)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(post)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# look at the data
head(sleep_beauty)
nightly_sleep_hours rated_attractiveness
9.3 3.475840
11.5 3.813389
6.7 4.917097
9.3 5.000430
9.8 4.685868
7.9 4.249000
dplyr::glimpse(sleep_beauty)
## Rows: 70
## Columns: 2
## $ nightly_sleep_hours  <dbl> 9.3, 11.5, 6.7, 9.3, 9.8, 7.9, 8.0, 10.4, 3.7, 1…
## $ rated_attractiveness <dbl> 3.475840, 3.813389, 4.917097, 5.000430, 4.685868…
# eyeballing r correlation coefficient
ggplot(sleep_beauty, aes(x = nightly_sleep_hours, y = rated_attractiveness)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 8), breaks = seq(0, 8, 1)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 14), breaks = seq(0, 14, 1)) +
  geom_vline(aes(xintercept = mean(nightly_sleep_hours)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(rated_attractiveness)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# calculating CI's for r
cor.test(x = sleep_beauty$nightly_sleep_hours, y = sleep_beauty$rated_attractiveness)
## 
##  Pearson's product-moment correlation
## 
## data:  sleep_beauty$nightly_sleep_hours and sleep_beauty$rated_attractiveness
## t = -2.7175, df = 68, p-value = 0.008337
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.51041935 -0.08420143
## sample estimates:
##       cor 
## -0.312983

4.5 Handling missing values

# load data
load("~/lsr/lsr/analysis/input/parenthood2.Rdata")

# ignore all cases that have any missing values
cor(parenthood2, use = "complete.obs")
##              dan.sleep baby.sleep   dan.grump         day
## dan.sleep   1.00000000  0.6394985 -0.89951468  0.06132891
## baby.sleep  0.63949845  1.0000000 -0.58656066  0.14555814
## dan.grump  -0.89951468 -0.5865607  1.00000000 -0.06816586
## day         0.06132891  0.1455581 -0.06816586  1.00000000
# look at the variables when determining what to drop
cor(parenthood2, use = "pairwise.complete.obs")
##              dan.sleep  baby.sleep    dan.grump          day
## dan.sleep   1.00000000  0.61472303 -0.903442442 -0.076796665
## baby.sleep  0.61472303  1.00000000 -0.567802669  0.058309485
## dan.grump  -0.90344244 -0.56780267  1.000000000  0.005833399
## day        -0.07679667  0.05830949  0.005833399  1.000000000

4.6 Transforming or recoding a variable

# load data
load("~/lsr/lsr/analysis/input/likert.Rdata")

# make a data frame
df <- data.frame(likert.raw)
# recode the variable (+3 strong agree, 0 no opinion, -3 strong disagree)
df$likert.centred <- df$likert.raw - 4
# create a strength variable
df$opinion.strength <- abs(df$likert.centred)
# create a direction variable (all negative numbers converted to -1, all positives to 1 and zero stays 0)
df$opinion.dir <- sign(df$likert.centred)
df
likert.raw likert.centred opinion.strength opinion.dir
1 -3 3 -1
7 3 3 1
3 -1 1 -1
4 0 0 0
4 0 0 0
4 0 0 0
2 -2 2 -1
6 2 2 1
5 1 1 1
5 1 1 1
# cutting numeric variables into categories
age <- c(60, 58, 24, 26, 34, 42, 31, 30, 33, 2, 9)
age.breaks <- seq(from = 0, to = 60, by = 20)
age.labels <- c("young", "adult", "older")
age.group <- cut(x = age, breaks = age.breaks, labels = age.labels)
age.df <- data.frame(age, age.group)

4.7 Subsetting

# load data
load("~/lsr/lsr/analysis/input/nightgarden.Rdata")
itng <- data.frame(speaker, utterance)


# using split
speech.by.char <- split(utterance, speaker)
speech.by.char
## $`makka-pakka`
## [1] "pip" "pip" "onk" "onk"
## 
## $tombliboo
## [1] "ee" "oo"
## 
## $`upsy-daisy`
## [1] "pip" "pip" "onk" "onk"
# pull grouping variables out of the list and into workspace
importList(speech.by.char, ask = FALSE)

# using subset
df <- subset(itng, subset = speaker == "makka-pakka", select = utterance)

4.8 Sorting, flipping and merging data

# load data
load("~/lsr/lsr/analysis/input/nightgarden2.Rdata")

# sorting a data frame
sortFrame(garden, speaker, line)
speaker utterance line
case.4 makka-pakka pip 7
case.5 makka-pakka onk 9
case.3 tombliboo ee 5
case.1 upsy-daisy pip 1
case.2 upsy-daisy pip 2
# reverse order
sortFrame(garden, -speaker, line)
speaker utterance line
case.1 upsy-daisy pip 1
case.2 upsy-daisy pip 2
case.3 tombliboo ee 5
case.4 makka-pakka pip 7
case.5 makka-pakka onk 9

4.9 Working with text

# Shortening a string
animals <- c("cat", "dog", "kangaroo", "whale")

strtrim(animals, width = 3)
## [1] "cat" "dog" "kan" "wha"
substr(animals, start = 2, stop = 3)
## [1] "at" "og" "an" "ha"
# Paste strings together
paste("hello", "world")
## [1] "hello world"
paste0("hello", "world")
## [1] "helloworld"
paste("hello", "world", sep = ".")
## [1] "hello.world"
hw <- c("hello", "world")
ng <- c("nasty", "government")
paste(hw, ng)
## [1] "hello nasty"      "world government"
paste(hw, ng, collapse = " ")
## [1] "hello nasty world government"
# Splitting strings
monkey <- "It was the best of times. It was the blurst of times."
monkey.1 <- strsplit(monkey, split = " ", fixed = T)
monkey.2 <- strsplit(monkey, split = ".", fixed = T)

# Making transformations
old.text <- "albmino"
chartr(old = "alb", new = "chu", old.text)
## [1] "chumino"
# Matching and substituting text
beers <- c("little creatures", "sierra nevada", "coopers pale")
grep(pattern = "er", beers, fixed = T, value = T)
## [1] "sierra nevada" "coopers pale"
gsub(pattern = "a", replacement = "BLAH", beers, fixed = T)
## [1] "little creBLAHtures"    "sierrBLAH nevBLAHdBLAH" "coopers pBLAHle"
sub(pattern = "a", replacement = "BLAH", beers, fixed = T)
## [1] "little creBLAHtures" "sierrBLAH nevada"    "coopers pBLAHle"

5 Inferential statistics

5.1 Introduction to probability

# Functions to draw plots to show probability distributions

# Parameters for plots
emphCol <- rgb(0, 0, 1)
colour <- TRUE
emphColLight <- rgb(.5, .5, 1)
emphGrey <- grey(.5)

# Function to produce a styled binomial plot
# Draw a binomial probability mass function as a spike ("h") plot.
#
# Args:
#   n: number of trials; the plot shows the outcomes 0..n.
#   p: success probability of a single trial.
#   ...: further arguments forwarded to plot() (e.g. xlab).
#
# Uses the globals `colour` and `emphCol` defined earlier in this
# section to pick the spike colour.
binomPlot <- function(n, p, ...) {

  # probability mass at every possible outcome
  outcomes <- 0:n
  mass <- dbinom(x = outcomes, size = n, prob = p)

  # spike colour depends on the global colour switch
  spike_col <- if (colour) emphCol else "black"

  plot(
    outcomes, mass,
    type = "h", lwd = 3, ylab = "Probability",
    frame.plot = FALSE, col = spike_col, ...
  )
}
binomPlot(40, 1 / 4, xlab = "Number of skulls observed")

# Function to plot a normal distribution
# Plot the density of the standard normal distribution N(0, 1)
# over [-3, 3] as a smooth line.
#
# Uses the globals `colour` and `emphCol` defined earlier in this
# section to pick the line colour.
standardNormal <- function() {

  # evaluate the density on a fine grid
  grid <- seq(-3, 3, .01)
  density <- dnorm(grid, 0, 1)

  line_col <- if (colour) emphCol else "black"
  plot(grid, density,
    lwd = 3, ylab = "Probability Density", xlab = "Observed Value",
    frame.plot = FALSE, col = line_col, type = "l"
  )
}
standardNormal()

# Function to plot the different probability distributions
# Illustrate how the chi-square, t and F distributions arise from
# normal variates: simulate n draws of each and plot a histogram of
# the simulated values with the theoretical density curve overlaid.
# Relies on the globals `colour`, `emphColLight` and `emphGrey`
# defined earlier in this section for the fill colours.
variateRelations <- function() {

  # generate the data
  n <- 1000
  normal.a <- rnorm(n)
  normal.b <- rnorm(n)
  normal.c <- rnorm(n)
  # sum of squares of 3 standard normals ~ chi-square with df = 3
  chi.sq.3 <- (normal.a)^2 + (normal.b)^2 + (normal.c)^2
  normal.d <- rnorm(n)
  # standard normal divided by sqrt(chi-square / df) ~ t with df = 3
  t.3 <- normal.d / sqrt(chi.sq.3 / 3)
  chi.sq.20 <- rchisq(n, 20)
  # ratio of two df-scaled chi-squares ~ F with df = (3, 20)
  F.3.20 <- (chi.sq.3 / 3) / (chi.sq.20 / 20)

  # histogram for the normal data (bw is the histogram bin width)
  bw <- .25
  hist(normal.a, seq(min(normal.a) - bw, max(normal.a) + bw, bw),
    freq = FALSE, xlim = c(-4, 4),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", ylim = c(0, .45), axes = FALSE,
    xlab = "", ylab = "", main = "Simulated Normal Data",
    font.main = 1
  )
  # `x <- seq(...)` assigns inside the call so the same grid feeds
  # the density function in the second argument
  lines(x <- seq(-4, 4, .1), dnorm(x), lwd = 3, col = "black")
  axis(1)

  # histogram for the chi square data
  bw <- .5
  hist(chi.sq.3, seq(0, max(chi.sq.3) + bw, bw),
    freq = FALSE, xlim = c(0, 16),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .25),
    xlab = "", ylab = "", main = "Simulated Chi-Square Data",
    font.main = 1
  )
  lines(x <- seq(0, 16, .1), dchisq(x, 3), lwd = 3, col = "black")
  axis(1)

  # histogram for the t data
  bw <- .3
  hist(t.3, seq(min(t.3) - bw, max(t.3) + bw, bw),
    freq = FALSE, xlim = c(-5, 5),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .4),
    xlab = "", ylab = "", main = "Simulated t Data",
    font.main = 1
  )
  lines(x <- seq(-4, 4, .1), dt(x, 3), lwd = 3, col = "black")
  axis(1)

  # histogram for the F dist data
  bw <- .2
  hist(F.3.20, seq(0, max(F.3.20) + bw, bw),
    freq = FALSE, xlim = c(0, 6),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .7),
    xlab = "", ylab = "", main = "Simulated F Data",
    font.main = 1
  )
  lines(x <- seq(0, 6, .01), df(x, 3, 20), lwd = 3, col = "black")
  axis(1)
}
variateRelations()

# Plotting the sampling distribution of the mean with different sample sizes
# Show how the sampling distribution of the mean IQ narrows as the
# sample size grows: one histogram of simulated sample means for each
# of the sample sizes 1, 2 and 10, each overlaid with the population
# N(100, 15) density curve.
#
# Uses the globals `colour`, `emphColLight` and `emphGrey` defined
# earlier in this section for the fill colours.
samplingDistributions <- function() {

  # Draw one panel: a histogram of n simulated sample means, where
  # each mean comes from a sample of size N (sd shrinks as 15/sqrt(N)).
  drawPanel <- function(n, N) {
    IQ <- rnorm(n, 100, 15 / sqrt(N))
    hist(IQ,
      breaks = seq(10, 180, 5), border = "white", freq = FALSE,
      col = ifelse(colour, emphColLight, emphGrey),
      xlab = "IQ Score", ylab = "", xlim = c(60, 140),
      main = paste("Sample Size =", N), axes = FALSE,
      font.main = 1, ylim = c(0, .07)
    )
    axis(1)
  }

  # population density curve, overlaid on every panel
  xs <- 60:140
  ys <- dnorm(xs, 100, 15)

  # (replicate count, sample size) for the three panels
  for (spec in list(c(10000, 1), c(1000, 2), c(1000, 10))) {
    drawPanel(spec[1], spec[2])
    lines(xs, ys, lwd = 2, col = "black", type = "l")
  }
}
samplingDistributions()

5.2 Confidence interval (CI)

# computing the 2.5th and the 97.5th percentiles (95% CI) of the normal distribution
qnorm(p = c(.025, .975))
## [1] -1.959964  1.959964
# the 15th and 85th (70% CI)
qnorm(p = c(.15, .85))
## [1] -1.036433  1.036433
# calculating quantiles of the t distribution (95% CI)
qt(p = .975, df = 1000 - 1) # large sample
## [1] 1.962341
qt(p = .975, df = 10 - 1) # small sample
## [1] 2.262157
# calculating CI
# load data
load("~/lsr/lsr/analysis/input/afl24.Rdata")
ciMean(x = afl$attendance) # 95% CI (default)
##          2.5%    97.5%
## [1,] 31597.32 32593.12
ciMean(x = afl$attendance, conf = .99) # 99% CI
##          0.5%    99.5%
## [1,] 31440.77 32749.68
# plotting CI
# using bargraph
bargraph.CI(
  x.factor = year, # grouping variable
  response = attendance, # outcome variable
  data = afl, # data
  ci.fun = ciMean, # name of the function to calculate CIs
  xlab = "Year",
  ylab = "Average Attendance"
)

# using lineplot
lineplot.CI(
  x.factor = year, # grouping variable
  response = attendance, # outcome variable
  data = afl, # data
  ci.fun = ciMean, # name of the function to calculate CIs
  xlab = "Year",
  ylab = "Average Attendance"
)

# using plotmeans
plotmeans(
  formula = attendance ~ year,
  data = afl,
  n.label = FALSE
)

5.3 Categorical Data Analysis

# load data
load("input/randomness.Rdata")
load("input/chapek9.Rdata")
load("input/salem.Rdata")
load("input/agpp.Rdata")

#---------- The chi-squared goodness-of-fit test ---------- 

observed <- table(cards$choice_1)
observed
## 
##    clubs diamonds   hearts   spades 
##       35       51       64       50
# R mathematical version
probabilities <- c(clubs = .25, diamonds = .25, hearts = .25, spaces = .25)
N <- 200
expected <- N * probabilities # expected frequencies
expected
##    clubs diamonds   hearts   spaces 
##       50       50       50       50
# difference between null hypothesis expected and actual values
observed - expected
## 
##    clubs diamonds   hearts   spades 
##      -15        1       14        0
# squared that difference to get a collection of numbers that are big whenever the
# null hypothesis makes a bad prediction
(observed - expected)^2
## 
##    clubs diamonds   hearts   spades 
##      225        1      196        0
# getting different error scores
(observed - expected)^2 / expected
## 
##    clubs diamonds   hearts   spades 
##     4.50     0.02     3.92     0.00
# getting the goodness of fit statistic (X^2 or GOF)
sum((observed - expected)^2 / expected)
## [1] 8.44
# calculating the 95th percentile of a chi-squared distribution with k - 1 degrees of freedom
qchisq(p = .95, df = 3)
## [1] 7.814728
# getting the p-value
pchisq(q = 8.44, df = 3, lower.tail = F)
## [1] 0.03774185
# or
1 - pchisq(q = 8.44, df = 3)
## [1] 0.03774185
# all above calculations all at once
goodnessOfFitTest(cards$choice_1)
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             50            0.25
## diamonds             51             50            0.25
## hearts               64             50            0.25
## spades               50             50            0.25
## 
## Test results: 
##    X-squared statistic:  8.44 
##    degrees of freedom:  3 
##    p-value:  0.038
# specifying a different null hypothesis
nullProbs <- c(clubs = .2, diamonds = .3, hearts = .3, spades = .2)
goodnessOfFitTest(cards$choice_1, p = nullProbs)
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             40             0.2
## diamonds             51             60             0.3
## hearts               64             60             0.3
## spades               50             40             0.2
## 
## Test results: 
##    X-squared statistic:  4.742 
##    degrees of freedom:  3 
##    p-value:  0.192
# using chisq.test() for the goodness of fit test -> frequency table (observed)
chisq.test(observed)
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 8.44, df = 3, p-value = 0.03774
chisq.test(x = observed, p = c(clubs = .2, diamonds = .3, hearts = .3, spades = .2))
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 4.7417, df = 3, p-value = 0.1917
#---------- The chi-squared test of independence (or association) test ---------- 

# producing a contingency table
chapekFrequencies <- xtabs(~ choice + species, data = chapek9)
chapekFrequencies
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
chapekFrequenciesSum <- addmargins(chapekFrequencies, margin = c(1, 2), FUN = sum, quiet = T)

# running the test
associationTest(formula = ~ choice + species, data = chapek9)
## 
##      Chi-square test of categorical association
## 
## Variables:   choice, species 
## 
## Hypotheses: 
##    null:        variables are independent of one another
##    alternative: some contingency exists between variables
## 
## Observed contingency table:
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
## 
## Expected contingency table under the null hypothesis:
##         species
## choice   robot human
##   puppy   13.5  14.5
##   flower  20.8  22.2
##   data    52.7  56.3
## 
## Test results: 
##    X-squared statistic:  10.722 
##    degrees of freedom:  2 
##    p-value:  0.005 
## 
## Other information: 
##    estimated effect size (Cramer's v):  0.244
# using chisq.test() for a test of independence (association) -> cross-tabulation table (chapekFrequencies)
chisq.test(chapekFrequencies)
## 
##  Pearson's Chi-squared test
## 
## data:  chapekFrequencies
## X-squared = 10.722, df = 2, p-value = 0.004697
#---------- The Fisher exact test ----------

salem.tabs <- table(trial)
salem.tabs
##        on.fire
## happy   FALSE TRUE
##   FALSE     3    3
##   TRUE     10    0
# running the test
fisher.test(salem.tabs)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  salem.tabs
## p-value = 0.03571
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.000000 1.202913
## sample estimates:
## odds ratio 
##          0
#---------- The McNemar test ----------

right.table <- xtabs(~ response_before + response_after, data = agpp)
right.table
##                response_after
## response_before no yes
##             no  65   5
##             yes 25   5
# running the test
mcnemar.test(right.table)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  right.table
## McNemar's chi-squared = 12.033, df = 1, p-value = 0.0005226
cardChoices <- xtabs(~ choice_1 + choice_2, data = cards)
cardChoices
##           choice_2
## choice_1   clubs diamonds hearts spades
##   clubs       10        9     10      6
##   diamonds    20        4     13     14
##   hearts      20       18      3     23
##   spades      18       13     15      4
# testing if there is a relation between rows and columns
chisq.test(cardChoices)
## 
##  Pearson's Chi-squared test
## 
## data:  cardChoices
## X-squared = 29.237, df = 9, p-value = 0.0005909
# testing if the row totals (the frequencies for choice_1) are different from the column totals (the frequencies for choice_2)
mcnemar.test(cardChoices)
## 
##  McNemar's Chi-squared test
## 
## data:  cardChoices
## McNemar's chi-squared = 16.033, df = 6, p-value = 0.01358

5.4 Comparing two means

5.4.1 The one-sample z-test

# Load data
load("~/lsr/lsr/analysis/input/zeppo.Rdata")
grades
##  [1] 50 60 60 64 66 66 67 69 70 74 76 76 77 79 79 79 81 82 82 89
# Calculating z-test
# 1. sample mean
sample.mean <- mean(grades)

# 2. value of the population mean null hypothesis specifies
mu.null <- 67.5

# 3. assumed population standard deviation
sd.true <- 9.5

# 4. sample size
N <- length(grades)

# 5. calculate the true standard error of the mean
sem.true <- sd.true / sqrt(N)

# 6. calculate z-score
z.score <- (sample.mean - mu.null) / sem.true
z.score
## [1] 2.259606
# 7. calculate critical regions (area under the curve)
# upper tail
upper.area <- pnorm(q = z.score, lower.tail = F)
upper.area
## [1] 0.01192287
# lower tail
lower.area <- pnorm(q = -z.score, lower.tail = T)
lower.area
## [1] 0.01192287
# 8. calculate p-value
p.value <- lower.area + upper.area
p.value
## [1] 0.02384574

5.4.2 The one-sample t-test

# running one-sample t-test (two sided test)
oneSampleTTest(x = grades, mu = 67.5)
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean equals 67.5 
##    alternative: population mean not equal to 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.036 
## 
## Other information: 
##    two-sided 95% confidence interval:  [67.844, 76.756] 
##    estimated effect size (Cohen's d):  0.504
# one sided test
oneSampleTTest(x = grades, mu = 67.5, one.sided = "greater")
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean less than or equal to 67.5 
##    alternative: population mean greater than 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.018 
## 
## Other information: 
##    one-sided 95% confidence interval:  [68.619, Inf] 
##    estimated effect size (Cohen's d):  0.504
# or
oneSampleTTest(x = grades, mu = 67.5, one.sided = "less")
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean greater than or equal to 67.5 
##    alternative: population mean less than 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.982 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, 75.981] 
##    estimated effect size (Cohen's d):  0.504
# using t-test function
t.test(x = grades, mu = 67.5)
## 
##  One Sample t-test
## 
## data:  grades
## t = 2.2547, df = 19, p-value = 0.03615
## alternative hypothesis: true mean is not equal to 67.5
## 95 percent confidence interval:
##  67.84422 76.75578
## sample estimates:
## mean of x 
##      72.3

5.4.3 The independent samples t-test

# Load the data
load("~/lsr/lsr/analysis/input/harpo.Rdata")
harpo
grade tutor
65 Anastasia
72 Bernadette
66 Bernadette
74 Anastasia
73 Anastasia
71 Bernadette
66 Bernadette
76 Bernadette
69 Bernadette
79 Bernadette
73 Bernadette
62 Bernadette
83 Anastasia
76 Anastasia
69 Bernadette
68 Bernadette
65 Anastasia
86 Anastasia
60 Bernadette
70 Anastasia
80 Anastasia
73 Bernadette
68 Bernadette
55 Anastasia
67 Bernadette
78 Anastasia
78 Anastasia
90 Anastasia
77 Anastasia
74 Bernadette
56 Bernadette
68 Anastasia
74 Bernadette
# Parameters for plots
emphCol <- rgb(0, 0, 1)
emphColLight <- rgb(.5, .5, 1)
emphGrey <- grey(.5)
colour <- TRUE

# Plotting histograms
# Draw one histogram of grades for each tutor's students, on a
# common scale (grades 50-100, counts 0-7) so the two groups can
# be compared visually.
#
# Uses the global data frame `harpo` (columns grade, tutor) and the
# colour globals `colour`, `emphColLight`, `emphGrey` defined earlier.
harpoHist <- function() {

  # shared histogram style: white borders, emphasis fill colour
  drawHist <- function(values, ...) {
    hist(values,
      border = "white",
      col = ifelse(colour, emphColLight, emphGrey), ...
    )
    axis(1)
  }

  # one panel per tutor, identical axes and binning
  for (tutor_name in c("Anastasia", "Bernadette")) {
    drawHist(harpo$grade[harpo$tutor == tutor_name],
      xlim = c(50, 100), xlab = "Grade",
      main = paste0(tutor_name, "'s students"),
      font.main = 1, breaks = seq(50, 100, 5), ylim = c(0, 7)
    )
  }
}

harpoHist()

# Plotting means and CIs
# Plot the mean grade per tutor with confidence intervals (via
# gplots::plotmeans, attached earlier in the file), then overlay
# the line y = x for reference.
#
# Uses the global data frame `harpo` (columns grade, tutor).
harpoMeans <- function() {
  plotmeans(
    formula = grade ~ tutor, # plot grade by tutor group
    data = harpo,
    n.label = FALSE, # suppress the sample-size labels
    xlab = "Class",
    ylab = "Grade"
  )

  # reference line with intercept 0 and slope 1
  abline(a = 0, b = 1)
}

harpoMeans()

# running the Student's independent samples t-test (two sided test)
independentSamplesTTest(formula = grade ~ tutor, data = harpo, var.equal = T)
## 
##    Student's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means equal for both groups
##    alternative: different population means in each group
## 
## Test results: 
##    t-statistic:  2.115 
##    degrees of freedom:  31 
##    p-value:  0.043 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.197, 10.759] 
##    estimated effect size (Cohen's d):  0.74
# running the Welch's independent samples t-test (two sided test)
independentSamplesTTest(formula = grade ~ tutor, data = harpo) # not equal variances assumed
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means equal for both groups
##    alternative: different population means in each group
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.054 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-0.092, 11.048] 
##    estimated effect size (Cohen's d):  0.724
# using t.test function (Welch's test)
t.test(formula = grade ~ tutor, data = harpo)
## 
##  Welch Two Sample t-test
## 
## data:  grade by tutor
## t = 2.0342, df = 23.025, p-value = 0.05361
## alternative hypothesis: true difference in means between group Anastasia and group Bernadette is not equal to 0
## 95 percent confidence interval:
##  -0.09249349 11.04804904
## sample estimates:
##  mean in group Anastasia mean in group Bernadette 
##                 74.53333                 69.05556
# Student's test
t.test(formula = grade ~ tutor, data = harpo, var.equal = T)
## 
##  Two Sample t-test
## 
## data:  grade by tutor
## t = 2.1154, df = 31, p-value = 0.04253
## alternative hypothesis: true difference in means between group Anastasia and group Bernadette is not equal to 0
## 95 percent confidence interval:
##   0.1965873 10.7589683
## sample estimates:
##  mean in group Anastasia mean in group Bernadette 
##                 74.53333                 69.05556
# one sided test
independentSamplesTTest(formula = grade ~ tutor, data = harpo, one.sided = "Anastasia")
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for group 'Anastasia' 
##    alternative: population mean is larger for group 'Anastasia' 
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.027 
## 
## Other information: 
##    one-sided 95% confidence interval:  [0.863, Inf] 
##    estimated effect size (Cohen's d):  0.724
# or
independentSamplesTTest(formula = grade ~ tutor, data = harpo, one.sided = "Bernadette")
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for group 'Bernadette' 
##    alternative: population mean is larger for group 'Bernadette' 
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.973 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, 10.093] 
##    estimated effect size (Cohen's d):  0.724

5.4.4 The paired-samples t-test

# load data
load("~/lsr/lsr/analysis/input/chico.Rdata")
chico
id grade_test1 grade_test2
student1 42.9 44.6
student2 51.8 54.0
student3 71.7 72.3
student4 51.6 53.4
student5 63.5 63.8
student6 58.0 59.3
student7 59.8 60.8
student8 50.8 51.6
student9 62.5 64.3
student10 61.9 63.2
student11 50.4 51.8
student12 52.6 52.2
student13 63.0 63.0
student14 58.3 60.5
student15 53.3 57.1
student16 58.7 60.1
student17 50.1 51.7
student18 64.2 65.6
student19 57.4 58.3
student20 57.1 60.1
describe(chico)
vars n mean sd median trimmed mad min max range skew kurtosis se
id* 1 20 10.500 5.916080 10.5 10.50000 7.41300 1.0 20.0 19.0 0.0000000 -1.3809286 1.322876
grade_test1 2 20 56.980 6.616137 57.7 56.91875 7.70952 42.9 71.7 28.8 0.0540357 -0.3549756 1.479413
grade_test2 3 20 58.385 6.405612 59.7 58.35000 6.44931 44.6 72.3 27.7 -0.0538576 -0.3892926 1.432338
# create a "difference" variable
chico$improvement <- chico$grade_test2 - chico$grade_test1
chico
id grade_test1 grade_test2 improvement
student1 42.9 44.6 1.7
student2 51.8 54.0 2.2
student3 71.7 72.3 0.6
student4 51.6 53.4 1.8
student5 63.5 63.8 0.3
student6 58.0 59.3 1.3
student7 59.8 60.8 1.0
student8 50.8 51.6 0.8
student9 62.5 64.3 1.8
student10 61.9 63.2 1.3
student11 50.4 51.8 1.4
student12 52.6 52.2 -0.4
student13 63.0 63.0 0.0
student14 58.3 60.5 2.2
student15 53.3 57.1 3.8
student16 58.7 60.1 1.4
student17 50.1 51.7 1.6
student18 64.2 65.6 1.4
student19 57.4 58.3 0.9
student20 57.1 60.1 3.0
# running a paired samples t-test
#
# Option 1:
# a. create a "difference" variable (chico$improvement)
# b. run a one sample t-test on it
oneSampleTTest(chico$improvement, mu = 0)
## 
##    One sample t-test 
## 
## Data variable:   chico$improvement 
## 
## Descriptive statistics: 
##             improvement
##    mean           1.405
##    std dev.       0.970
## 
## Hypotheses: 
##    null:        population mean equals 0 
##    alternative: population mean not equal to 0 
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.951, 1.859] 
##    estimated effect size (Cohen's d):  1.448
# Option 2:
# using the pairedSamplesTTest function from lsr package (one sided formula) (two sided test)
pairedSamplesTTest(formula = ~ grade_test2 + grade_test1, data = chico)
## 
##    Paired samples t-test 
## 
## Variables:  grade_test2 , grade_test1 
## 
## Descriptive statistics: 
##             grade_test2 grade_test1 difference
##    mean          58.385      56.980      1.405
##    std dev.       6.406       6.616      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.951, 1.859] 
##    estimated effect size (Cohen's d):  1.448
#
# one sided test
pairedSamplesTTest(formula = ~ grade_test2 + grade_test1, data = chico, one.sided = "grade_test2")
## 
##    Paired samples t-test 
## 
## Variables:  grade_test2 , grade_test1 
## 
## Descriptive statistics: 
##             grade_test2 grade_test1 difference
##    mean          58.385      56.980      1.405
##    std dev.       6.406       6.616      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'grade_test2' 
##    alternative: population mean is larger for measurement 'grade_test2' 
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [1.03, Inf] 
##    estimated effect size (Cohen's d):  1.448
# running a paired samples t-test from long form data
# first, format df from wide to long
chico2 <- wideToLong(chico, within = "time")
chico2
id improvement time grade
student1 1.7 test1 42.9
student2 2.2 test1 51.8
student3 0.6 test1 71.7
student4 1.8 test1 51.6
student5 0.3 test1 63.5
student6 1.3 test1 58.0
student7 1.0 test1 59.8
student8 0.8 test1 50.8
student9 1.8 test1 62.5
student10 1.3 test1 61.9
student11 1.4 test1 50.4
student12 -0.4 test1 52.6
student13 0.0 test1 63.0
student14 2.2 test1 58.3
student15 3.8 test1 53.3
student16 1.4 test1 58.7
student17 1.6 test1 50.1
student18 1.4 test1 64.2
student19 0.9 test1 57.4
student20 3.0 test1 57.1
student1 1.7 test2 44.6
student2 2.2 test2 54.0
student3 0.6 test2 72.3
student4 1.8 test2 53.4
student5 0.3 test2 63.8
student6 1.3 test2 59.3
student7 1.0 test2 60.8
student8 0.8 test2 51.6
student9 1.8 test2 64.3
student10 1.3 test2 63.2
student11 1.4 test2 51.8
student12 -0.4 test2 52.2
student13 0.0 test2 63.0
student14 2.2 test2 60.5
student15 3.8 test2 57.1
student16 1.4 test2 60.1
student17 1.6 test2 51.7
student18 1.4 test2 65.6
student19 0.9 test2 58.3
student20 3.0 test2 60.1
# second, using pairedSamplesTTest on the long-form data
# Option 1 (two-sided): outcome ~ group formula plus an explicit id variable
pairedSamplesTTest(formula = grade ~ time, data = chico2, id = "id")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-1.859, -0.951] 
##    estimated effect size (Cohen's d):  1.448
# one-sided test: alternative is that the 'test2' population mean is larger
pairedSamplesTTest(formula = grade ~ time, data = chico2, id = "id", one.sided = "test2")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'test2' 
##    alternative: population mean is larger for measurement 'test2' 
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, -1.03] 
##    estimated effect size (Cohen's d):  1.448
# Option 2: id variable supplied inside the formula as "+ (id)"
pairedSamplesTTest(formula = grade ~ time + (id), data = chico2)
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-1.859, -0.951] 
##    estimated effect size (Cohen's d):  1.448
# using base R's t.test with paired = TRUE (equivalent to pairedSamplesTTest above);
# TRUE is spelled out rather than T, since T is an ordinary binding in R and
# can be silently reassigned, corrupting the call
t.test(x = chico$grade_test2, y = chico$grade_test1, paired = TRUE) # the first element of x and y must correspond to the same person
## 
##  Paired t-test
## 
## data:  chico$grade_test2 and chico$grade_test1
## t = 6.4754, df = 19, p-value = 0.000003321
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.9508686 1.8591314
## sample estimates:
## mean of the differences 
##                   1.405
# one-sided version of Option 2
pairedSamplesTTest(formula = grade ~ time + (id), data = chico2, one.sided = "test2")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'test2' 
##    alternative: population mean is larger for measurement 'test2' 
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, -1.03] 
##    estimated effect size (Cohen's d):  1.448

5.4.5 Effect size

# Cohen's d for a one-sample design: compares `grades` against mu = 67.5
cohensD(x = grades, mu = 67.5)
## [1] 0.5041691
# Cohen's d matching a Student t-test (pooled-variance estimate)
cohensD(formula = grade ~ tutor, data = harpo, method = "pooled")
## [1] 0.7395614
# Cohen's d matching a Welch t-test (variances not assumed equal)
cohensD(formula = grade ~ tutor, data = harpo, method = "unequal")
## [1] 0.7244995

5.4.6 Checking the normality of a sample

# generating data: 100 draws from a standard normal distribution
# (the W and p printed below reflect one particular random draw)
normal.data <- rnorm(n = 100)

# plotting data: histogram and normal QQ-plot as visual normality checks
hist(x = normal.data)

qqnorm(y = normal.data)

# running a Shapiro-Wilk test of normality
shapiro.test(x = normal.data)
## 
##  Shapiro-Wilk normality test
## 
## data:  normal.data
## W = 0.99108, p-value = 0.7516

5.4.7 Testing non-normal data with Wilcoxon tests

# load data (paths are specific to the original author's home directory)
load("~/lsr/lsr/analysis/input/awesome.Rdata")
load("~/lsr/lsr/analysis/input/awesome2.Rdata")
load("~/lsr/lsr/analysis/input/happy.Rdata")
awesome
scores group
6.4 A
10.7 A
11.9 A
7.3 A
10.0 A
14.5 B
10.4 B
12.9 B
11.7 B
13.0 B
# the same scores stored as one vector per group (from awesome2.Rdata)
score.A
## [1]  6.4 10.7 11.9  7.3 10.0
score.B
## [1] 14.5 10.4 12.9 11.7 13.0
# before/after happiness scores (from happy.Rdata)
happiness
before after change
30 6 -24
43 29 -14
21 11 -10
24 31 7
23 17 -6
40 2 -38
29 31 2
56 21 -35
38 8 -30
16 21 5
# running the two-sample (rank sum) Wilcoxon test, formula interface
wilcox.test(formula = scores ~ group, data = awesome)
## 
##  Wilcoxon rank sum exact test
## 
## data:  scores by group
## W = 3, p-value = 0.05556
## alternative hypothesis: true location shift is not equal to 0
# same test when the data are stored as a separate vector per group
wilcox.test(x = score.A, y = score.B)
## 
##  Wilcoxon rank sum exact test
## 
## data:  score.A and score.B
## W = 3, p-value = 0.05556
## alternative hypothesis: true location shift is not equal to 0
# one-sample (signed rank) Wilcoxon test on the change scores
wilcox.test(x = happiness$change, mu = 0)
## 
##  Wilcoxon signed rank exact test
## 
## data:  happiness$change
## V = 7, p-value = 0.03711
## alternative hypothesis: true location is not equal to 0
# equivalently, a paired test on the raw before/after scores;
# TRUE is spelled out rather than T, since T is reassignable in R
wilcox.test(x = happiness$after, y = happiness$before, paired = TRUE)
## 
##  Wilcoxon signed rank exact test
## 
## data:  happiness$after and happiness$before
## V = 7, p-value = 0.03711
## alternative hypothesis: true location shift is not equal to 0

5.5 Comparing several means (one-way ANOVA)

# load the clinical trial data (path specific to the original author)
load("~/lsr/lsr/analysis/input/clinicaltrial.Rdata")
clin.trial
drug therapy mood.gain
placebo no.therapy 0.5
placebo no.therapy 0.3
placebo no.therapy 0.1
anxifree no.therapy 0.6
anxifree no.therapy 0.4
anxifree no.therapy 0.2
joyzepam no.therapy 1.4
joyzepam no.therapy 1.7
joyzepam no.therapy 1.3
placebo CBT 0.6
placebo CBT 0.9
placebo CBT 0.3
anxifree CBT 1.1
anxifree CBT 0.8
anxifree CBT 1.2
joyzepam CBT 1.8
joyzepam CBT 1.3
joyzepam CBT 1.4
# descriptive statistics
# how many people received each drug (6 per group, balanced design)
xtabs(~drug, clin.trial)
## drug
##  placebo anxifree joyzepam 
##        6        6        6
# calculating per-drug means and standard deviations of mood gain,
# renaming the value column so the two summaries can be told apart
mood.gain.mean <- aggregate(mood.gain ~ drug, clin.trial, mean) |>
rename(mood.gain.mean = mood.gain)

mood.gain.sd <- aggregate(mood.gain ~ drug, clin.trial, sd) |>
rename(mood.gain.sd = mood.gain)

mood.gain.mean
drug mood.gain.mean
placebo 0.4500000
anxifree 0.7166667
joyzepam 1.4833333
mood.gain.sd
drug mood.gain.sd
placebo 0.2810694
anxifree 0.3920034
joyzepam 0.2136976
# plotting the group means with confidence intervals (gplots::plotmeans);
# FALSE is spelled out rather than F, since F is reassignable in R
plotmeans(
  formula = mood.gain ~ drug,
  data = clin.trial,
  xlab = "Drug Administered",
  ylab = "Mood Gain",
  n.label = FALSE
)

# calculating the within-group sum of squares "by hand"
outcome <- clin.trial$mood.gain # outcome variable
group <- clin.trial$drug # grouping factor
gp.means <- tapply(outcome, group, mean) # one mean per group
gp.means <- gp.means[group] # expand to one entry per observation
dev.from.gp.means <- outcome - gp.means # deviation from own group mean
squared.devs <- dev.from.gp.means^2
Y <- data.frame(group, outcome, gp.means, dev.from.gp.means, squared.devs)
Y
group outcome gp.means dev.from.gp.means squared.devs
placebo 0.5 0.4500000 0.0500000 0.0025000
placebo 0.3 0.4500000 -0.1500000 0.0225000
placebo 0.1 0.4500000 -0.3500000 0.1225000
anxifree 0.6 0.7166667 -0.1166667 0.0136111
anxifree 0.4 0.7166667 -0.3166667 0.1002778
anxifree 0.2 0.7166667 -0.5166667 0.2669444
joyzepam 1.4 1.4833333 -0.0833333 0.0069444
joyzepam 1.7 1.4833333 0.2166667 0.0469444
joyzepam 1.3 1.4833333 -0.1833333 0.0336111
placebo 0.6 0.4500000 0.1500000 0.0225000
placebo 0.9 0.4500000 0.4500000 0.2025000
placebo 0.3 0.4500000 -0.1500000 0.0225000
anxifree 1.1 0.7166667 0.3833333 0.1469444
anxifree 0.8 0.7166667 0.0833333 0.0069444
anxifree 1.2 0.7166667 0.4833333 0.2336111
joyzepam 1.8 1.4833333 0.3166667 0.1002778
joyzepam 1.3 1.4833333 -0.1833333 0.0336111
joyzepam 1.4 1.4833333 -0.0833333 0.0069444
# within-group SS: sum of the squared deviations
SSw <- sum(squared.devs)
SSw
## [1] 1.391667
# calculating the between-group sum of squares
gp.means <- tapply(outcome, group, mean) # group means (one per group again)
grand.mean <- mean(outcome) # overall mean
dev.from.grand.mean <- gp.means - grand.mean
squared.devs <- dev.from.grand.mean^2
gp.sizes <- tapply(outcome, group, length) # n per group (6 each)
wt.squared.devs <- gp.sizes * squared.devs # weight each deviation by group size
Y <- data.frame(gp.means, grand.mean, dev.from.grand.mean, squared.devs, gp.sizes, wt.squared.devs)
Y
gp.means grand.mean dev.from.grand.mean squared.devs gp.sizes wt.squared.devs
placebo 0.4500000 0.8833333 -0.4333333 0.1877778 6 1.1266667
anxifree 0.7166667 0.8833333 -0.1666667 0.0277778 6 0.1666667
joyzepam 1.4833333 0.8833333 0.6000000 0.3600000 6 2.1600000
# between-group SS: sum of the weighted squared deviations
SSb <- sum(wt.squared.devs)
SSb
## [1] 3.453333
# calculating degrees of freedom (G = 3 groups, N = 18 observations)
df_b <- 3 - 1
df_w <- 18 - 3

# calculating the mean square values (using the rounded SSb = 3.45, SSw = 1.39)
MS_b <- 3.45 / df_b
MS_w <- 1.39 / df_w

# calculating F-value
# NOTE(fix): the original stored this statistic in a variable named `F`,
# shadowing R's built-in shorthand for FALSE. The subsequent call
# pf(F, ..., lower.tail = F) therefore passed the F-statistic (truthy -> TRUE)
# as lower.tail, so the report printed the LOWER tail, 1 - p = 0.9999136,
# instead of the p-value. A different name and an explicit FALSE fix this.
f.stat <- MS_b / MS_w
f.stat
## [1] 18.61511
# calculating p-value (upper tail of the F(2, 15) distribution)
p.value <- pf(f.stat, df1 = df_b, df2 = df_w, lower.tail = FALSE)
p.value
## [1] 0.0000867 (approx.; agrees with Pr(>F) = 0.0000865 in the aov() summary)
# calculating the same ANOVA with aov()
my.anova <- aov(mood.gain ~ drug, clin.trial)
my.anova
## Call:
##    aov(formula = mood.gain ~ drug, data = clin.trial)
## 
## Terms:
##                     drug Residuals
## Sum of Squares  3.453333  1.391667
## Deg. of Freedom        2        15
## 
## Residual standard error: 0.3045944
## Estimated effects may be unbalanced
# standard ANOVA table: F(2, 15) = 18.61, p = 0.0000865
summary(my.anova)
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## drug         2  3.453  1.7267   18.61 0.0000865 ***
## Residuals   15  1.392  0.0928                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# calculating the effect size (eta squared) by hand from the sums of squares
SStot <- SSb + SSw
eta.squared <- SSb / SStot
eta.squared
## [1] 0.7127623
# or directly from the fitted aov object with lsr::etaSquared()
etaSquared(x = my.anova)
##         eta.sq eta.sq.part
## drug 0.7127623   0.7127623

5.5.1 Multiple comparisons and post hoc tests

# extracting each drug's mood-gain scores as a plain vector
# the tidy way: filter + select returns a one-column data frame,
# so [, 1] is needed to drop it down to a vector
placebo <- clin.trial |>
filter(drug == "placebo") |>
select(mood.gain)
placebo <- placebo[, 1]

anxifree <- clin.trial |>
filter(drug == "anxifree") |>
select(mood.gain)
anxifree <- anxifree[, 1]

joyzepam <- clin.trial |>
filter(drug == "joyzepam") |>
select(mood.gain)
joyzepam <- joyzepam[, 1]

# the non-tidy way (overwrites the vectors above with identical values)
joyzepam <- with(clin.trial, mood.gain[drug == "joyzepam"]) # mood change due to joyzepam
anxifree <- with(clin.trial, mood.gain[drug == "anxifree"]) # mood change due to anxifree
placebo <- with(clin.trial, mood.gain[drug == "placebo"]) # mood change due to placebo

# running "pairwise" t-tests by hand: anxifree vs placebo (pooled variance)
t.test(anxifree, placebo, var.equal = TRUE) # Student t-test
## 
##  Two Sample t-test
## 
## data:  anxifree and placebo
## t = 1.3542, df = 10, p-value = 0.2055
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1721001  0.7054334
## sample estimates:
## mean of x mean of y 
## 0.7166667 0.4500000
# or with the formula interface, subsetting to the two groups of interest
t.test(
  formula = mood.gain ~ drug,
  data = clin.trial,
  subset = drug %in% c("placebo", "anxifree"),
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  mood.gain by drug
## t = -1.3542, df = 10, p-value = 0.2055
## alternative hypothesis: true difference in means between group placebo and group anxifree is not equal to 0
## 95 percent confidence interval:
##  -0.7054334  0.1721001
## sample estimates:
##  mean in group placebo mean in group anxifree 
##              0.4500000              0.7166667
# or all pairwise comparisons at once, without any multiplicity correction
pairwise.t.test(
  x = clin.trial$mood.gain, # outcome variable
  g = clin.trial$drug, # grouping variable
  p.adjust.method = "none" # which correction to use
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  clin.trial$mood.gain and clin.trial$drug 
## 
##          placebo anxifree
## anxifree 0.15021 -       
## joyzepam 0.00003 0.00056 
## 
## P value adjustment method: none
# or via lsr::posthocPairwiseT, which accepts the fitted aov object
posthocPairwiseT(x = my.anova, p.adjust.method = "none") # calling pairwise.t.test using an aov object
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo anxifree
## anxifree 0.15021 -       
## joyzepam 0.00003 0.00056 
## 
## P value adjustment method: none
# corrections for multiple testing
# Bonferroni correction: each p-value scaled by the number of comparisons
posthocPairwiseT(my.anova, p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo  anxifree
## anxifree 0.4506   -       
## joyzepam 0.000091 0.0017  
## 
## P value adjustment method: bonferroni
# Holm correction (the default; uniformly more powerful than Bonferroni)
posthocPairwiseT(my.anova)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo  anxifree
## anxifree 0.1502   -       
## joyzepam 0.000091 0.0011  
## 
## P value adjustment method: holm

5.5.2 Checking the homogeneity of variance assumption

# running Levene's test, centering each group at its mean
leveneTest(y = my.anova, center = mean) # mean
Df F value Pr(>F)
group 2 1.449739 0.2656941
15 NA NA
# or with the formula interface
leveneTest(y = mood.gain ~ drug, data = clin.trial) # y is a formula in this case
Df F value Pr(>F)
group 2 1.467181 0.2618423
15 NA NA
# or with outcome and grouping vectors passed separately
leveneTest(y = clin.trial$mood.gain, group = clin.trial$drug) # y is the outcome
Df F value Pr(>F)
group 2 1.467181 0.2618423
15 NA NA
# running the Brown-Forsythe variant (car's default: center at the median)
leveneTest(my.anova) # median
Df F value Pr(>F)
group 2 1.467181 0.2618423
15 NA NA

5.5.3 Removing the homogeneity of variance assumption

# running the Welch one-way test (does not assume equal variances)
oneway.test(mood.gain ~ drug, data = clin.trial)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  mood.gain and drug
## F = 26.322, num df = 2.0000, denom df = 9.4932, p-value = 0.000134

5.5.4 Checking the normality assumption

# extract the residuals from the fitted ANOVA model
my.anova.residuals <- residuals(object = my.anova)
my.anova.residuals
##           1           2           3           4           5           6 
##  0.05000000 -0.15000000 -0.35000000 -0.11666667 -0.31666667 -0.51666667 
##           7           8           9          10          11          12 
## -0.08333333  0.21666667 -0.18333333  0.15000000  0.45000000 -0.15000000 
##          13          14          15          16          17          18 
##  0.38333333  0.08333333  0.48333333  0.31666667 -0.18333333 -0.08333333
# plotting the residuals to check normality: histogram and QQ-plot
hist(x = my.anova.residuals)

qqnorm(y = my.anova.residuals)

# formal check: Shapiro-Wilk test on the residuals
shapiro.test(x = my.anova.residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  my.anova.residuals
## W = 0.96019, p-value = 0.6053

5.5.5 Removing the normality assumption

# running the Kruskal-Wallis test (rank-based alternative to one-way ANOVA)
kruskal.test(mood.gain ~ drug, data = clin.trial)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mood.gain by drug
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386
# or with outcome and grouping vectors passed separately
kruskal.test(x = clin.trial$mood.gain, g = clin.trial$drug)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  clin.trial$mood.gain and clin.trial$drug
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386
# or by passing a list with one score vector per group
# (note: this creates a new `mood.gain` list; clin.trial itself is untouched)
mood.gain <- list(placebo, joyzepam, anxifree)
kruskal.test(x = mood.gain)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mood.gain
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386

5.6 Linear regression

# grey colour used for points and residual segments in the plots below,
# and passed as the dotcol parameter to drawBasicScatterplot
emphGrey <- grey(.5)

# plotting scatterplots (directly)
# plot(x = parenthood$dan.sleep, y = parenthood$dan.grump,
#      xlab = "Dan Sleep (hours)",
#      ylab = "Dan Grumpiness (0-100)")
# plot(x = parenthood$baby.sleep, y = parenthood$dan.grump)
# function to plot scatterplots
drawBasicScatterplot <- function(dotcol, title) {
  plot(parenthood$dan.sleep,
    parenthood$dan.grump,
    xlab = "Dan Sleep (hours)",
    ylab = "Dan Grumpiness (0-100)",
    col = dotcol,
    main = title,
    font.main = 1,
    pch = 19
  )
}

drawBasicScatterplot(emphGrey, "")

# plotting a depiction of the residuals associated with the best fitting
# regression line (vertical segments from each point down/up to the line)
# NOTE(review): `colour` and `emphCol` are not defined in this section —
# presumably set in the report's preamble; confirm before running standalone.
# good regression line
drawBasicScatterplot(emphGrey, "Regression Line Close to the Data")
good.coef <- lm(dan.grump ~ dan.sleep, parenthood)$coef
abline(good.coef, col = ifelse(colour, emphCol, "black"), lwd = 3)
for (i in seq_along(parenthood$dan.sleep)) {
  xval <- parenthood$dan.sleep[i] * c(1, 1) # same x twice: a vertical segment
  yval <- c(parenthood$dan.grump[i], good.coef[1] + good.coef[2] * parenthood$dan.sleep[i])
  lines(xval, yval, type = "l", col = emphGrey)
}

# bad regression line (arbitrary intercept/slope, far from the data)
drawBasicScatterplot(emphGrey, "Regression Line Distant from the Data")
bad.coef <- c(80, -3)
abline(bad.coef, col = ifelse(colour, emphCol, "black"), lwd = 3)
for (i in seq_along(parenthood$dan.sleep)) {
  xval <- parenthood$dan.sleep[i] * c(1, 1)
  yval <- c(parenthood$dan.grump[i], bad.coef[1] + bad.coef[2] * parenthood$dan.sleep[i])
  lines(xval, yval, type = "l", col = emphGrey)
}

# running a simple regression: grumpiness predicted by Dan's sleep
regression.1 <- lm(dan.grump ~ dan.sleep, data = parenthood)
regression.1
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# running a multiple linear regression: add the baby's sleep as a predictor
regression.2 <- lm(dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
regression.2
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##   125.96557     -8.95025      0.01052

5.6.1 Quantifying the fit of the regression model

# The R-squared value, computed by hand
X <- parenthood$dan.sleep # the predictor
Y <- parenthood$dan.grump # the outcome
Y.pred <- -8.94 * X + 125.97 # predictions from the (rounded) fitted coefficients

# calculating the sum of squared residuals
SS.resid <- sum((Y - Y.pred)^2)
SS.resid
## [1] 1838.722
# calculating the total sum of squares
SS.tot <- sum((Y - mean(Y))^2)
SS.tot
## [1] 9998.59
# calculating the coefficient of determination or R-squared
R.squared <- 1 - (SS.resid / SS.tot)
R.squared
## [1] 0.8161018
# relation between regression and correlation:
# R-squared equals the squared Pearson correlation between X and Y
r <- cor(X, Y)
r^2
## [1] 0.8161027

5.6.2 Hypothesis test for regression models

# compute all the quantities of the multiple regression model at once
summary(regression.2)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0345  -2.2198  -0.4016   2.6775  11.7496 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 125.96557    3.04095  41.423 <0.0000000000000002 ***
## dan.sleep    -8.95025    0.55346 -16.172 <0.0000000000000002 ***
## baby.sleep    0.01052    0.27106   0.039               0.969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.354 on 97 degrees of freedom
## Multiple R-squared:  0.8161, Adjusted R-squared:  0.8123 
## F-statistic: 215.2 on 2 and 97 DF,  p-value: < 0.00000000000000022

5.6.3 Hypothesis test for a single correlation

# t-tests on the individual coefficients are part of summary()
summary(regression.1)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.025  -2.213  -0.399   2.681  11.750 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 125.9563     3.0161   41.76 <0.0000000000000002 ***
## dan.sleep    -8.9368     0.4285  -20.85 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.332 on 98 degrees of freedom
## Multiple R-squared:  0.8161, Adjusted R-squared:  0.8142 
## F-statistic: 434.9 on 1 and 98 DF,  p-value: < 0.00000000000000022
# the slope's t-test is equivalent to testing the Pearson correlation
cor.test(x = parenthood$dan.sleep, y = parenthood$dan.grump)
## 
##  Pearson's product-moment correlation
## 
## data:  parenthood$dan.sleep and parenthood$dan.grump
## t = -20.854, df = 98, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9340614 -0.8594714
## sample estimates:
##       cor 
## -0.903384

5.6.4 Regarding regression coefficients

# 99% confidence intervals for the regression coefficients
confint(object = regression.2, level = .99)
##                   0.5 %      99.5 %
## (Intercept) 117.9755724 133.9555593
## dan.sleep   -10.4044419  -7.4960575
## baby.sleep   -0.7016868   0.7227357
# standardised regression coefficients (betas) via lsr::standardCoefs
standardCoefs(regression.2)
##                      b        beta
## dan.sleep  -8.95024973 -0.90474809
## baby.sleep  0.01052447  0.00217223

5.6.5 Model checking

# Residuals
# ordinary (raw) residuals of the multiple regression
residuals(object = regression.2)
##           1           2           3           4           5           6 
##  -2.1403095   4.7081942   1.9553640  -2.0602806   0.7194888  -0.4066133 
##           7           8           9          10          11          12 
##   0.2269987  -1.7003077   0.2025039   3.8524589   3.9986291  -4.9120150 
##          13          14          15          16          17          18 
##   1.2060134   0.4946578  -2.6579276  -0.3966805   3.3538613   1.7261225 
##          19          20          21          22          23          24 
##  -0.4922551  -5.6405941  -0.4660764   2.7238389   9.3653697   0.2841513 
##          25          26          27          28          29          30 
##  -0.5037668  -1.4941146   8.1328623   1.9787316  -1.5126726   3.5171148 
##          31          32          33          34          35          36 
##  -8.9256951  -2.8282946   6.1030349  -7.5460717   4.5572128 -10.6510836 
##          37          38          39          40          41          42 
##  -5.6931846   6.3096506  -2.1082466  -0.5044253   0.1875576   4.8094841 
##          43          44          45          46          47          48 
##  -5.4135163  -6.2292842  -4.5725232  -5.3354601   3.9950111   2.1718745 
##          49          50          51          52          53          54 
##  -3.4766440   0.4834367   6.2839790   2.0109396  -1.5846631  -2.2166613 
##          55          56          57          58          59          60 
##   2.2033140   1.9328736  -1.8301204  -1.5401430   2.5298509  -3.3705782 
##          61          62          63          64          65          66 
##  -2.9380806   0.6590736  -0.5917559  -8.6131971   5.9781035   5.9332979 
##          67          68          69          70          71          72 
##  -1.2341956   3.0047669  -1.0802468   6.5174672  -3.0155469   2.1176720 
##          73          74          75          76          77          78 
##   0.6058757  -2.7237421  -2.2291472  -1.4053822   4.7461491  11.7495569 
##          79          80          81          82          83          84 
##   4.7634141   2.6620908 -11.0345292  -0.7588667   1.4558227  -0.4745727 
##          85          86          87          88          89          90 
##   8.9091201  -1.1409777   0.7555223  -0.4107130   0.8797237  -1.4095586 
##          91          92          93          94          95          96 
##   3.1571385  -3.4205757  -5.7228699  -2.2033958  -3.8647891   0.4982711 
##          97          98          99         100 
##  -5.5249495   4.1134221  -8.2038533   5.6800859
# standardised residuals (each divided by its estimated standard deviation)
rstandard(model = regression.2)
##           1           2           3           4           5           6 
## -0.49675845  1.10430571  0.46361264 -0.47725357  0.16756281 -0.09488969 
##           7           8           9          10          11          12 
##  0.05286626 -0.39260381  0.04739691  0.89033990  0.95851248 -1.13898701 
##          13          14          15          16          17          18 
##  0.28047841  0.11519184 -0.61657092 -0.09191865  0.77692937  0.40403495 
##          19          20          21          22          23          24 
## -0.11552373 -1.31540412 -0.10819238  0.62951824  2.17129803  0.06586227 
##          25          26          27          28          29          30 
## -0.11980449 -0.34704024  1.91121833  0.45686516 -0.34986350  0.81233165 
##          31          32          33          34          35          36 
## -2.08659993 -0.66317843  1.42930082 -1.77763064  1.07452436 -2.47385780 
##          37          38          39          40          41          42 
## -1.32715114  1.49419658 -0.49115639 -0.11674947  0.04401233  1.11881912 
##          43          44          45          46          47          48 
## -1.27081641 -1.46422595 -1.06943700 -1.24659673  0.94152881  0.51069809 
##          49          50          51          52          53          54 
## -0.81373349  0.11412178  1.47938594  0.46437962 -0.37157009 -0.51609949 
##          55          56          57          58          59          60 
##  0.51800753  0.44813204 -0.42662358 -0.35575611  0.58403297 -0.78022677 
##          61          62          63          64          65          66 
## -0.67833325  0.15484699 -0.13760574 -2.05662232  1.40238029  1.37505125 
##          67          68          69          70          71          72 
## -0.28964989  0.69497632 -0.24945316  1.50709623 -0.69864682  0.49071427 
##          73          74          75          76          77          78 
##  0.14267297 -0.63246560 -0.51972828 -0.32509811  1.10842574  2.72171671 
##          79          80          81          82          83          84 
##  1.09975101  0.62057080 -2.55172097 -0.17584803  0.34340064 -0.11158952 
##          85          86          87          88          89          90 
##  2.10863391 -0.26386516  0.17624445 -0.09504416  0.20450884 -0.32730740 
##          91          92          93          94          95          96 
##  0.73475640 -0.79400855 -1.32768248 -0.51940736 -0.91512580  0.11661226 
##          97          98          99         100 
## -1.28069115  0.96332849 -1.90290258  1.31368144
# (externally) Studentised residuals: leave-one-out variance estimates
rstudent(model = regression.2)
##           1           2           3           4           5           6 
## -0.49482102  1.10557030  0.46172854 -0.47534555  0.16672097 -0.09440368 
##           7           8           9          10          11          12 
##  0.05259381 -0.39088553  0.04715251  0.88938019  0.95810710 -1.14075472 
##          13          14          15          16          17          18 
##  0.27914212  0.11460437 -0.61459001 -0.09144760  0.77533036  0.40228555 
##          19          20          21          22          23          24 
## -0.11493461 -1.32043609 -0.10763974  0.62754813  2.21456485  0.06552336 
##          25          26          27          28          29          30 
## -0.11919416 -0.34546127  1.93818473  0.45499388 -0.34827522  0.81089646 
##          31          32          33          34          35          36 
## -2.12403286 -0.66125192  1.43712830 -1.79797263  1.07539064 -2.54258876 
##          37          38          39          40          41          42 
## -1.33244515  1.50388257 -0.48922682 -0.11615428  0.04378531  1.12028904 
##          43          44          45          46          47          48 
## -1.27490649 -1.47302872 -1.07023828 -1.25020935  0.94097261  0.50874322 
##          49          50          51          52          53          54 
## -0.81230544  0.11353962  1.48863006  0.46249410 -0.36991317 -0.51413868 
##          55          56          57          58          59          60 
##  0.51604474  0.44627831 -0.42481754 -0.35414868  0.58203894 -0.77864171 
##          61          62          63          64          65          66 
## -0.67643392  0.15406579 -0.13690795 -2.09211556  1.40949469  1.38147541 
##          67          68          69          70          71          72 
## -0.28827768  0.69311245 -0.24824363  1.51717578 -0.69679156  0.48878534 
##          73          74          75          76          77          78 
##  0.14195054 -0.63049841 -0.51776374 -0.32359434  1.10974786  2.81736616 
##          79          80          81          82          83          84 
##  1.10095270  0.61859288 -2.62827967 -0.17496714  0.34183379 -0.11101996 
##          85          86          87          88          89          90 
##  2.14753375 -0.26259576  0.17536170 -0.09455738  0.20349582 -0.32579584 
##          91          92          93          94          95          96 
##  0.73300184 -0.79248469 -1.33298848 -0.51744314 -0.91435205  0.11601774 
##          97          98          99         100 
## -1.28498273  0.96296745 -1.92942389  1.31867548
# Anomalous data
# hat values (leverage) for each observation
hatvalues(model = regression.2)
##          1          2          3          4          5          6          7 
## 0.02067452 0.04105320 0.06155445 0.01685226 0.02734865 0.03129943 0.02735579 
##          8          9         10         11         12         13         14 
## 0.01051224 0.03698976 0.01229155 0.08189763 0.01882551 0.02462902 0.02718388 
##         15         16         17         18         19         20         21 
## 0.01964210 0.01748592 0.01691392 0.03712530 0.04213891 0.02994643 0.02099435 
##         22         23         24         25         26         27         28 
## 0.01233280 0.01853370 0.01804801 0.06722392 0.02214927 0.04472007 0.01039447 
##         29         30         31         32         33         34         35 
## 0.01381812 0.01105817 0.03468260 0.04048248 0.03814670 0.04934440 0.05107803 
##         36         37         38         39         40         41         42 
## 0.02208177 0.02919013 0.05928178 0.02799695 0.01519967 0.04195751 0.02514137 
##         43         44         45         46         47         48         49 
## 0.04267879 0.04517340 0.03558080 0.03360160 0.05019778 0.04587468 0.03701290 
##         50         51         52         53         54         55         56 
## 0.05331282 0.04814477 0.01072699 0.04047386 0.02681315 0.04556787 0.01856997 
##         57         58         59         60         61         62         63 
## 0.02919045 0.01126069 0.01012683 0.01546412 0.01029534 0.04428870 0.02438944 
##         64         65         66         67         68         69         70 
## 0.07469673 0.04135090 0.01775697 0.04217616 0.01384321 0.01069005 0.01340216 
##         71         72         73         74         75         76         77 
## 0.01716361 0.01751844 0.04863314 0.02158623 0.02951418 0.01411915 0.03276064 
##         78         79         80         81         82         83         84 
## 0.01684599 0.01028001 0.02920514 0.01348051 0.01752758 0.05184527 0.04583604 
##         85         86         87         88         89         90         91 
## 0.05825858 0.01359644 0.03054414 0.01487724 0.02381348 0.02159418 0.02598661 
##         92         93         94         95         96         97         98 
## 0.02093288 0.01982480 0.05063492 0.05907629 0.03682026 0.01817919 0.03811718 
##         99        100 
## 0.01945603 0.01373394
# Cook's distance per observation (combines leverage and residual size)
cooks.distance(model = regression.2)
##             1             2             3             4             5 
## 0.00173651171 0.01740242936 0.00469937006 0.00130141703 0.00026315569 
##             6             7             8             9            10 
## 0.00009697585 0.00002620181 0.00054584906 0.00002876269 0.00328827681 
##            11            12            13            14            15 
## 0.02731835260 0.00829691897 0.00066214792 0.00012359557 0.00253891456 
##            16            17            18            19            20 
## 0.00005012283 0.00346174151 0.00209805468 0.00019570497 0.01780518526 
##            21            22            23            24            25 
## 0.00008367377 0.00164947783 0.02967593755 0.00002657610 0.00034480325 
##            26            27            28            29            30 
## 0.00090933792 0.05699951226 0.00073079434 0.00057169976 0.00245956350 
##            31            32            33            34            35 
## 0.05214331478 0.00618519973 0.02700686248 0.05467344690 0.02071643055 
##            36            37            38            39            40 
## 0.04606378152 0.01765311655 0.04689816875 0.00231612212 0.00007012530 
##            41            42            43            44            45 
## 0.00002827824 0.01076083119 0.02399930996 0.03381062226 0.01406497804 
##            46            47            48            49            50 
## 0.01801086321 0.01561698982 0.00417998637 0.00848351357 0.00024447866 
##            51            52            53            54            55 
## 0.03689945760 0.00077944724 0.00194123453 0.00244622954 0.00427036054 
##            56            57            58            59            60 
## 0.00126660900 0.00182421163 0.00048047048 0.00116318135 0.00318723481 
##            61            62            63            64            65 
## 0.00159551161 0.00037038258 0.00015778917 0.11381653127 0.02827715166 
##            66            67            68            69            70 
## 0.01139374043 0.00123142189 0.00226000648 0.00022413221 0.01028478940 
##            71            72            73            74            75 
## 0.00284132858 0.00143122276 0.00034685379 0.00294175691 0.00273824884 
##            76            77            78            79            80 
## 0.00050453567 0.01387108351 0.04230965547 0.00418744027 0.00386183094 
##            81            82            83            84            85 
## 0.02965826018 0.00018388884 0.00214936854 0.00019939290 0.09168733246 
##            86            87            88            89            90 
## 0.00031989944 0.00032621922 0.00004547383 0.00034008931 0.00078814872 
##            91            92            93            94            95 
## 0.00480120376 0.00449309454 0.01188426604 0.00479636047 0.01752665888 
##            96            97            98            99           100 
## 0.00017327933 0.01012301719 0.01225818302 0.02394963622 0.00801050779
# test from car package to see if any of the Studentised residuals are significantly larger
# than expected under the model; the reported p-values are Bonferroni-adjusted
# for testing all n residuals at once
outlierTest(model = regression.2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 78 2.817366          0.0058788      0.58788
# getting rid of outliers using subset
# (a negative index in `subset` drops that row: refit without observation 64,
# the case with the largest Cook's distance in the output above)
lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, subset = -64)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, 
##     subset = -64)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##    126.3553      -8.8283      -0.1319
# plotting anomalous data using plot()
plot(x = regression.2, which = 4) # Cook's distance 

plot(x = regression.2, which = 5) # the Studentised residuals against leverage

# using the car package
# index plots of several influence diagnostics at once
influenceIndexPlot(model = regression.2)

# bubble plot of Studentised residuals vs leverage (circle area ~ Cook's distance);
# also prints the most noteworthy cases
influencePlot(model = regression.2)
##      StudRes       Hat     CookD
## 11  0.9581071 0.0818976 0.0273184
## 64 -2.0921156 0.0746967 0.1138165
## 78  2.8173662 0.0168460 0.0423097
## 81 -2.6282797 0.0134805 0.0296583
## 85  2.1475338 0.0582586 0.0916873
# Checking the normality of the residuals
# plotting histogram of the raw residuals
hist(x = residuals(regression.2),
     xlab = "Value of residual",
     main = "",
     breaks = 20)

# plotting qq-plot (which = 2 selects the normal Q-Q plot from plot.lm)
plot(x = regression.2, which = 2)

# Checking the linearity of the relationship
# plotting the relationship between the fitted values and the observed values
# (a linear model should show points scattered around a straight line)
yhat.2 <- fitted.values(object = regression.2)
plot(x = yhat.2,
     y = parenthood$dan.grump,
     xlab = "Fitted Values",
     ylab = "Observed Values")

# plotting the relationship between the fitted values and the residuals themselves
plot(x = regression.2, which = 1)

# with residualPlots() function from car package
# (one residual plot per predictor, plus curvature test statistics and
# Tukey's test for the fitted values — see the output below)
residualPlots(model = regression.2)

##            Test stat Pr(>|Test stat|)  
## dan.sleep     2.1604          0.03323 *
## baby.sleep   -0.5445          0.58733  
## Tukey test    2.1615          0.03066 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking the homogeneity of variance
# (which = 3 selects the scale-location plot from plot.lm)
plot(x = regression.2, which = 3)

# running the non-constant variance test
# (score test of the null hypothesis that the error variance is constant;
# the large p below means no evidence of heteroscedasticity here)
ncvTest(regression.2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.09317511, Df = 1, p = 0.76018
# running sandwich estimators if homogeneity of variance is violated
# (coeftest from lmtest, with car's hccm() supplying a
# heteroscedasticity-consistent covariance matrix)
coeftest(regression.2, vcov. = hccm)
## 
## t test of coefficients:
## 
##               Estimate Std. Error  t value            Pr(>|t|)    
## (Intercept) 125.965566   3.247285  38.7910 <0.0000000000000002 ***
## dan.sleep    -8.950250   0.615820 -14.5339 <0.0000000000000002 ***
## baby.sleep    0.010524   0.291565   0.0361              0.9713    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking for collinearity
# calculating the variance inflation factors (VIFs)
# (with only two predictors the two VIFs are necessarily equal)
vif(mod = regression.2)
##  dan.sleep baby.sleep 
##   1.651038   1.651038
# example of getting collinearity
# (refit with day as the outcome; note the clearly inflated VIFs
# for dan.sleep and dan.grump in the output below)
regression.3 <- lm(day ~ baby.sleep + dan.sleep + dan.grump, parenthood)
vif(mod = regression.3)
## baby.sleep  dan.sleep  dan.grump 
##   1.651064   6.102337   5.437903

5.6.6 Model selection

# the backward elimination approach
# 1. get the complete regression model
full.model <- lm(formula = dan.grump ~ dan.sleep + baby.sleep + day, data = parenthood)

# 2. running for the first time the step() function: backward
# (repeatedly removes the predictor whose deletion lowers AIC the most,
# stopping when no single removal improves AIC)
step(object = full.model, direction = "backward")
## Start:  AIC=299.08
## dan.grump ~ dan.sleep + baby.sleep + day
## 
##              Df Sum of Sq    RSS    AIC
## - baby.sleep  1       0.1 1837.2 297.08
## - day         1       1.6 1838.7 297.16
## <none>                    1837.1 299.08
## - dan.sleep   1    4909.0 6746.1 427.15
## 
## Step:  AIC=297.08
## dan.grump ~ dan.sleep + day
## 
##             Df Sum of Sq    RSS    AIC
## - day        1       1.6 1838.7 295.17
## <none>                   1837.2 297.08
## - dan.sleep  1    8103.0 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1838.7 295.17
## - dan.sleep  1    8159.9 9998.6 462.50
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# running step() forward
# (start from the intercept-only model and add predictors from `scope`
# one at a time for as long as AIC keeps improving)
null.model <- lm(dan.grump ~ 1, parenthood)
step(object = null.model, direction = "forward", scope = dan.grump ~ dan.sleep + baby.sleep + day)
## Start:  AIC=462.5
## dan.grump ~ 1
## 
##              Df Sum of Sq    RSS    AIC
## + dan.sleep   1    8159.9 1838.7 295.17
## + baby.sleep  1    3202.7 6795.9 425.89
## <none>                    9998.6 462.50
## + day         1      58.5 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    1838.7 295.17
## + day         1   1.55760 1837.2 297.08
## + baby.sleep  1   0.02858 1838.7 297.16
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# comparing two regression models
# 1. running the regressions
# (M0 is nested in M1: same predictors plus baby.sleep)
M0 <- lm(dan.grump ~ dan.sleep + day, parenthood)
M1 <- lm(dan.grump ~ dan.sleep + day + baby.sleep, parenthood)

# A. based on a model selection criterion (AIC) — lower is better
AIC(M0, M1)
##    df      AIC
## M0  4 582.8681
## M1  5 584.8646
# B. calculating the F statistic
# (anova on two nested lm fits performs an F-test of whether the
# extra predictor significantly reduces the residual sum of squares)
anova(M0, M1)
##   Res.Df      RSS Df Sum of Sq         F    Pr(>F)
## 1     97 1837.156 NA        NA        NA        NA
## 2     96 1837.092  1 0.0636879 0.0033281 0.9541157

6 Linting

The code in this RMarkdown is linted with the lintr package, which is based on the tidyverse style guide.

# lintr::lint("main.Rmd", linters =
#               lintr::with_defaults(
#                 commented_code_linter = NULL,
#                 trailing_whitespace_linter = NULL
#                 )
#             )
# if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")